Example No. 1
def extract_most_intense(
        h5_file_name: str, molecule_id: str, adduct: str, rt_min: float,
        rt_max: float,
        mz_tol: float) -> Tuple[Spectrum, float, float, float, str]:
    """
    Inputs:
        molecule_id: an InChI or SMILES string identifying the molecule
        mz_tol: m/z tolerance in ppm
    Returns: Spectrum, RT, parent_mass, precursor_mz, collision_energy
    """
    mol = cheminfo.inchi_or_smiles_to_molecule(molecule_id)
    parent_mass = ExactMolWt(mol)
    precursor_mz = cheminfo.get_precursor_mz(parent_mass, adduct)
    h5_data = ma_data.df_container_from_metatlas_file(h5_file_name)
    msms_df = h5_data["ms2_pos"] if cheminfo.is_positive_mode(
        adduct) else h5_data["ms2_neg"]
    in_tol_df = msms_df.groupby(GROUP_SPECTRUM_COLS).filter(
        lambda x: in_rt_mz_ranges(x.iloc[0]["rt"], rt_min, rt_max, x.iloc[0][
            "precursor_MZ"], precursor_mz, mz_tol))
    precursor_intensity_max = in_tol_df["precursor_intensity"].max()
    most_intense_df = in_tol_df.groupby(GROUP_SPECTRUM_COLS).filter(
        lambda x: precursor_intensity_max == x.iloc[0]["precursor_intensity"])
    most_intense = most_intense_df.iloc[0]
    return (
        Spectrum(tuple(most_intense_df["mz"]), tuple(most_intense_df["i"])),
        most_intense["rt"],
        parent_mass,
        float(most_intense["precursor_MZ"]),
        get_collision_energy(h5_file_name),
    )
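The helper in_rt_mz_ranges is not defined in this excerpt; a hypothetical sketch of what it is assumed to check (RT inside the window and observed precursor m/z within the ppm tolerance of the theoretical value) could look like this.

# Hypothetical sketch of the in_rt_mz_ranges helper used above; the real
# implementation may differ.
def in_rt_mz_ranges(rt, rt_min, rt_max, observed_mz, theoretical_mz, mz_tol):
    """True when rt lies in [rt_min, rt_max] and observed_mz is within
    mz_tol ppm of theoretical_mz."""
    ppm_error = abs(observed_mz - theoretical_mz) / theoretical_mz * 1e6
    return rt_min <= rt <= rt_max and ppm_error <= mz_tol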
Example No. 2
def calc_properties(smiles_list):
    logp_vals, mw_vals = [], []
    for smiles in smiles_list:
        mol = Chem.MolFromSmiles(smiles)
        logp_vals.append(MolLogP(mol, True))
        mw_vals.append(ExactMolWt(mol))
    return pd.DataFrame({'logp': logp_vals, 'mw': mw_vals})
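A minimal usage sketch for the calc_properties above; the imports are assumptions about how the original module brings in MolLogP and ExactMolWt.

import pandas as pd
from rdkit import Chem
from rdkit.Chem.Crippen import MolLogP
from rdkit.Chem.Descriptors import ExactMolWt

# ethanol, benzene, and aspirin; the result has one row per SMILES
df = calc_properties(["CCO", "c1ccccc1", "CC(=O)Oc1ccccc1C(=O)O"])
print(df)  # columns: logp, mw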
Example No. 3
    def ghose_filter(self, filepath, exclude_salt=False):
        r""" Filter the given file with the Ghose filter. If exclude_salt is
        True, molecules containing atoms outside the no_salt_atoms list are
        also filtered out.
        filepath (str): path to the .mol2 file or .gz file.
        exclude_salt (bool): whether to filter out molecules containing salt atoms.
        =======================================================================
        return (str): filtered molecules as a string in Mol2 file format.
        """
        reader = Mol2Reader(filepath)
        blocks = reader.get_blocks()
        filtered = list()
        for block in tqdm(blocks):
            mol = Chem.rdmolfiles.MolFromMol2Block(block, sanitize=False)
            if mol is None:
                continue
            n_atoms = mol.GetNumAtoms()
            if n_atoms < 20 or n_atoms > 70:
                continue
            mw = ExactMolWt(mol)
            if mw < 180 or mw > 480:
                continue
            if exclude_salt:
                # skip molecules containing atoms outside the allowed (no-salt) element list
                if any(atom.GetSymbol() not in self.no_salt_atoms
                       for atom in mol.GetAtoms()):
                    continue
            filtered.append(block)

        return "\n\n".join(filtered)
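For reference, a hypothetical standalone helper applying the same atom-count and molecular-weight window to a single RDKit Mol (the method above additionally streams Mol2 blocks and uses the class's no_salt_atoms list).

from rdkit import Chem
from rdkit.Chem.Descriptors import ExactMolWt

def passes_ghose_window(mol, allowed_atoms=None):
    """Sketch: True if the molecule has 20-70 atoms and 180-480 Da exact mass,
    and (optionally) contains only atoms from allowed_atoms."""
    if mol is None:
        return False
    if not 20 <= mol.GetNumAtoms() <= 70:
        return False
    if not 180 <= ExactMolWt(mol) <= 480:
        return False
    if allowed_atoms is not None and any(
            atom.GetSymbol() not in allowed_atoms for atom in mol.GetAtoms()):
        return False
    return True

print(passes_ghose_window(Chem.MolFromSmiles("CC(=O)Oc1ccccc1C(=O)O")))  # False: only 13 heavy atoms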
Example No. 4
def count_struct_isomers(smiles_list):
    """
    Counts the number of molecules sharing the same molecular formula.
    Keyword arguments:
    smiles_list -- a list of SMILES strings of the set/subset of molecules to look at
    Returns: a dict mapping exact molecular weight to the number of molecules with that weight
	"""
    # formula: isomer count
    dict_isomers = {}
    # formula : smiles list
    dict_smiles = {}
    # weight : isomer count
    dict_exactwt = {}

    for mol_smiles in smiles_list:
        mol = MolFromSmiles(mol_smiles)
        formula = CalcMolFormula(mol)
        weight = ExactMolWt(mol)
        if formula in dict_isomers.keys():
            dict_isomers[formula] += 1  # increase the isomer count by 1
            dict_smiles[formula].append(
                mol_smiles)  # These are MOD's smiles, not RDKit's
            dict_exactwt[
                weight] += 1  # Weight calculated by RDKit, not MOD's in-built
        else:
            dict_isomers[formula] = 1
            dict_smiles[formula] = [mol_smiles]
            dict_exactwt[weight] = 1
    return dict_exactwt  # modify this as per your needs
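A small usage sketch; butane and isobutane share the formula C4H10 and therefore the same exact mass, so the returned dict has a single weight key with a count of 2 (the imports are the RDKit names the function relies on).

from rdkit.Chem import MolFromSmiles
from rdkit.Chem.Descriptors import ExactMolWt
from rdkit.Chem.rdMolDescriptors import CalcMolFormula

counts = count_struct_isomers(["CCCC", "CC(C)C"])
print(counts)  # {58.07825...: 2}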
Example No. 5
def get_canonical_strings(tautomer, tautomer_molecules):
    A = []  # atom counts, one per tautomer molecule
    B = []  # exact molecular weights, one per tautomer molecule
    canonical_tautomer = []  # list that will hold the final output

    for mi in tautomer_molecules:
        A.append(mi.GetNumAtoms())
        B.append(ExactMolWt(mi))

    set_A = list(set(A))  # unique atom counts
    set_B = list(set(B))  # unique exact weights
    assert (len(set_A) == len(set_B))
    num_of_taut = len(set_A)  # number of distinct tautomers (according to the sets)
    master = {}  # first canonical string registered for each distinct tautomer

    for i in range(0, num_of_taut):  # mark every tautomer slot as unregistered
        master['t' + str(i)] = "NONE"

    for i in range(0, num_of_taut):  # iterate over the number of tautomers
        for a, b, c in zip(A, B, tautomer):  # walk the parallel lists together
            if a == set_A[i] and b == set_B[i]:  # entry belongs to this subset
                if master['t' + str(i)] == "NONE":  # no registration yet
                    master['t' + str(i)] = c  # register this string
                    canonical_tautomer.append(c)
                else:
                    canonical_tautomer.append(master['t' + str(i)])  # reuse the registered string
    return (canonical_tautomer, num_of_taut)
Example No. 6
    def preprocess(dataset, dir_input):

        train_smiles = list(dataset['SMILES'])
        train_adducts = dataset['Adducts']
        train_ccs = list(dataset['CCS'])

        adducts_encoder = AdductToOneHotEncoder()
        adducts_encoder.fit(train_adducts)
        adducts = adducts_encoder.transform(train_adducts)

        Smiles, molecules, adjacencies, properties, descriptors = '', [], [], [], []
        for i, smi in enumerate(train_smiles):
            if '.' in smi:
                continue
            smi = Chem.MolToSmiles(Chem.MolFromSmiles(smi))
            mol = Chem.MolFromSmiles(smi)
            mol = Chem.AddHs(mol)
            atoms = create_atoms(mol)
            i_jbond_dict = create_ijbonddict(mol)

            fingerprints = extract_fingerprints(atoms, i_jbond_dict, radius)
            adjacency = create_adjacency(mol)

            Smiles += smi + '\n'
            molecules.append(fingerprints)
            adjacencies.append(adjacency)
            properties.append([[train_ccs[i]]])
            descriptors.append([
                ExactMolWt(mol),
                MolLogP(mol),
                GetFormalCharge(mol),
                CalcNumRings(mol),
                CalcNumRotatableBonds(mol),
                CalcLogS(mol),
                AcidCount(mol),
                BaseCount(mol),
                APolar(mol),
                BPolar(mol)
            ])

        properties = np.array(properties)
        mean, std = np.mean(properties), np.std(properties)
        properties = np.array((properties - mean) / std)

        os.makedirs(dir_input, exist_ok=True)

        with open(dir_input + 'Smiles.txt', 'w') as f:
            f.write(Smiles)
        np.save(dir_input + 'molecules', molecules)
        np.save(dir_input + 'adducts', adducts)
        np.save(dir_input + 'adjacencies', adjacencies)
        np.save(dir_input + 'properties', properties)
        np.save(dir_input + 'descriptors', descriptors)
        np.save(dir_input + 'mean', mean)
        np.save(dir_input + 'std', std)
        dump_dictionary(fingerprint_dict,
                        dir_input + 'fingerprint_dict.pickle')
Example No. 7
def calc_properties(smi):
    # returns logP, TPSA, MW, MR
    m = Chem.MolFromSmiles(smi.numpy())
    logP = MolLogP(m)
    tpsa = CalcTPSA(m)
    # sas = calculateScore(m)
    mw = ExactMolWt(m)
    mr = MolMR(m)
    return np.asarray(logP), np.asarray(tpsa), np.asarray(mw), np.asarray(mr)
Example No. 8
    def _filter_by_mass_and_rt(
        self,
        possible_ranges: List[Tuple[float, float, str, str]],
        cpd_info: List[Tuple[str]],
    ) -> Tuple[Optional[str], Dict]:
        """Check whether the compound's mass (and, optionally, retention time)
        lies within any of the possible mass ranges.

        Parameters
        ----------
        possible_ranges : List[Tuple[float, float, str, str]]
            Possible mass ranges based on peak masses and tolerance.
        cpd_info : List[Tuple[str]]
            Tuple of compound ID, SMILES, peak ID, and adduct name.

        Returns
        -------
        c_id_if_matched : str, optional
            Contains the compound ID if a hit is found, None by default.
        cpd_dict : Dict
            Contains predicted retention time, matched peak IDs (if any), and
            matched adduct names (if any).
        """
        c_id_if_matched = None
        cpd_dict = {"Predicted_RT": None, "Matched_Peak_IDs": [], "Matched_Adducts": []}

        cpd_exact_mass = ExactMolWt(MolFromSmiles(cpd_info[1]))
        predicted_rt = None
        for possible_range in possible_ranges:
            if possible_range[0] < cpd_exact_mass < possible_range[1]:
                c_id = cpd_info[0]
                smiles = cpd_info[1]
                peak_id = possible_range[2]
                adduct = possible_range[3]

                if self.filter_by_rt:
                    if not predicted_rt:
                        predicted_rt = self._predict_rt(smiles)
                    if not predicted_rt:
                        # sometimes can't predict RT due to missing vals in fingerprint
                        continue

                    expt_rt = self.metabolomics_dataset.get_rt(peak_id)
                    if not expt_rt:
                        raise ValueError(f"No retention time found for peak, {peak_id}")

                    cpd_dict["Predicted_RT"] = predicted_rt
                    if abs(expt_rt - predicted_rt) > self.rt_threshold:
                        continue  # if outside threshold, don't add to matched peaks

                c_id_if_matched = c_id
                cpd_dict["Matched_Peak_IDs"].append(peak_id)
                cpd_dict["Matched_Adducts"].append(adduct)

        return c_id_if_matched, cpd_dict
Example No. 9
def fill_calculated_fields(comp: metob.Compound, mol: Chem.rdchem.Mol) -> None:
    assert mol is not None
    comp.inchi_key = comp.inchi_key or Chem.inchi.InchiToInchiKey(comp.inchi)
    comp.formula = comp.formula or Chem.rdMolDescriptors.CalcMolFormula(mol)
    comp.mono_isotopic_molecular_weight = comp.mono_isotopic_molecular_weight or ExactMolWt(
        mol)
    comp.permanent_charge = comp.permanent_charge or Chem.GetFormalCharge(mol)
    comp.number_components = comp.number_components or 1  # type: ignore
    comp.num_free_radicals = comp.num_free_radicals or Chem.Descriptors.NumRadicalElectrons(
        mol)
    fill_neutralized_fields(comp, mol)
Example No. 10
    def _printinfo(mol):
        mol2 = Chem.AddHs(mol)
        AllChem.EmbedMolecule(mol2)
        OUTPUT_FILENAME = "output.mol"
        output_path = os.path.join(output_dir, OUTPUT_FILENAME)
        with open(output_path, "w") as fp:
            print(Chem.MolToMolBlock(mol2), file=fp)
        print(CONFORMATION_KEY, output_path)
        print(MOLW_KEY, ExactMolWt(mol2))
        print(ATOMCOUNT_KEY, mol2.GetNumAtoms())
        print(BONDCOUNT_KEY, mol2.GetNumBonds())
Example No. 11
def search(query: str, min_mw: float, max_mw: float,
           layout: widgets.Box) -> None:
    with get_new_log_box(layout):
        clear_search_output(layout)
        results = get_synonym_matches(query)
        for cur in results:
            RDLogger.DisableLog("rdApp.*")  # hide rdkit warnings
            cur["mol"] = cheminfo.normalize_molecule(
                Chem.inchi.MolFromInchi(cur["inchi"]))
            cur["norm_inchi"] = Chem.inchi.MolToInchi(cur["mol"])
            RDLogger.EnableLog("rdApp.*")
            cur["MW"] = ExactMolWt(cur["mol"])
        filtered = filter_by_mw(filter_to_norm_inchi_in_db(results), min_mw,
                                max_mw)
        logger.debug("Found %d matches to %s.", len(filtered), query)
        if not is_valid_num_results(len(filtered), query, layout):
            return
        final = sorted(filtered, key=lambda x: x["MW"])
        logger.debug("Num mols: %d", len(final))
        column_names = ["", "Name", "MW", "Structure"]
        sheet = ipysheet.sheet(
            rows=len(final),
            columns=len(column_names),
            column_headers=column_names,
            column_resizing=False,
            column_width=[1, 4, 2, 10],
        )
        buttons = [
            widgets.Button(description="use",
                           layout=widgets.Layout(width="100%")) for x in final
        ]
        for button in buttons:
            button.on_click(
                lambda current: on_use_button_clicked(current, final, layout))
        ipysheet.column(0, buttons)
        ipysheet.column(1, [x["name"] for x in final])
        ipysheet.column(2, [ExactMolWt(x["mol"]) for x in final])
        ipysheet.column(3, [cheminfo.mol_to_image(x["mol"]) for x in final])
        layout.children = swap_layout(layout.children,
                                      LayoutPosition.SEARCH_OUTPUT.value,
                                      sheet)
Example No. 12
    def analyze(self, smiles: List[str], only_drugs=True) -> pd.DataFrame:
        features = self.preprocessor.transform(smiles)

        # RDKit molecular properties
        inchikey = []
        weight = []
        logp = []
        hdonors = []
        hacceptors = []
        for example in smiles:
            mol = MolFromSmiles(example)
            if not mol:
                raise ValueError("Malformed molecule passed in to analyze")

            inchikey.append(MolToInchiKey(mol))
            weight.append(ExactMolWt(mol))
            logp.append(MolLogP(mol))
            hdonors.append(NumHDonors(mol))
            hacceptors.append(NumHAcceptors(mol))

        # Scores
        safety = self.safety.predict(features)
        feasibility = self.feasibility.predict(features)
        bbbp = self.bbbp.predict_proba(features)

        dataframe = pd.DataFrame(
            {
                "key": inchikey,
                "smiles": smiles,
                "weight": weight,
                "logp": logp,
                "hdonors": hdonors,
                "hacceptors": hacceptors,
                "safety": safety,
                "feasibility": feasibility,
                "bbbp": (i[1] for i in bbbp),
            }
        )

        if only_drugs:
            # Lipinski's rules
            dataframe = dataframe[dataframe.weight < 500]
            dataframe = dataframe[dataframe.hdonors <= 5]
            dataframe = dataframe[dataframe.hacceptors <= 10]
            dataframe = dataframe[dataframe.logp <= 5]

            # Filter too toxic and infeasible compounds
            dataframe = dataframe[dataframe.safety > 0.75]
            dataframe = dataframe[dataframe.feasibility > 0.75]

            dataframe = dataframe.reset_index(drop=True)

        return dataframe
Example No. 13
def make_mass_spectra(smiles_list):
    molecules = [MolFromSmiles(smiles) for smiles in smiles_list]
    weights = [ExactMolWt(mol) for mol in molecules]
    highest_mass = max(weights)
    least_mass = min(weights)
    # make a histogram of the exact masses of the molecules simulated by MOD.
    plt.hist(weights, bins=range(500))
    plt.xlabel("Exact Mass")
    plt.ylabel("Frequency")
    plt.title(
        "Mass spectra of the molecules simulated in the reaction network.")
    plt.show()
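A usage sketch; matplotlib and the RDKit imports below are what the function above relies on.

import matplotlib.pyplot as plt
from rdkit.Chem import MolFromSmiles
from rdkit.Chem.Descriptors import ExactMolWt

# plots a histogram of exact masses for a handful of small molecules
make_mass_spectra(["CCO", "CCCC", "c1ccccc1", "CC(=O)Oc1ccccc1C(=O)O"])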
Example No. 14
    def __init__(self, configuration: StatsExtractionConfig):
        self._filters = FilterTypesEnum

        self._columns = DataframeColumnsEnum
        self._stats = StatsExtractionEnum
        self._purging = PurgingEnum
        self._configuration = configuration
        standardisation_config_dict = self._configuration.standardisation_config
        standardisation_config = [
            FilterConfiguration(name=name, parameters=params)
            for name, params in standardisation_config_dict.items()
        ]

        dec_separator = self._stats.DECORATION_SEPARATOR_TOKEN
        attachment_token = self._stats.ATTACHMENT_POINT_TOKEN
        self._mol_wts_udf = psf.udf(
            lambda x: ExactMolWt(Chem.MolFromSmiles(x)), pst.FloatType())
        self._num_rings_udf = psf.udf(
            lambda x: rdMolDescriptors.CalcNumRings(Chem.MolFromSmiles(x)),
            pst.IntegerType())
        self._num_atoms_udf = psf.udf(
            lambda x: Chem.MolFromSmiles(x).GetNumHeavyAtoms(),
            pst.IntegerType())
        self._num_aromatic_rings_udf = psf.udf(
            lambda x: rdMolDescriptors.CalcNumAromaticRings(
                Chem.MolFromSmiles(x)), pst.IntegerType())
        self._hbond_donors_udf = psf.udf(
            lambda x: rdMolDescriptors.CalcNumHBD(Chem.MolFromSmiles(x)),
            pst.IntegerType())
        self._hbond_acceptors_udf = psf.udf(
            lambda x: rdMolDescriptors.CalcNumHBA(Chem.MolFromSmiles(x)),
            pst.IntegerType())
        self._hetero_atom_ratio_udf = psf.udf(
            lambda x: len([
                atom for atom in Chem.MolFromSmiles(x).GetAtoms()
                if atom.GetAtomicNum() == 6
            ]) / Chem.MolFromSmiles(x).GetNumHeavyAtoms(), pst.FloatType())
        self._make_canonical_udf = psf.udf(
            lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x)),
            pst.StringType())
        self._standardise_smiles_udf = psf.udf(
            lambda x: RDKitStandardizer(standardisation_config, None).
            apply_filter(x), pst.StringType())
        pattern = self._stats.REGEX_TOKENS
        self.regex = re.compile(pattern)
        self._tokeniser_udf = psf.udf(self.regex.findall,
                                      pst.ArrayType(pst.StringType()))
        self._decoration_split_udf = psf.udf(lambda x: x.split(dec_separator),
                                             pst.ArrayType(pst.StringType()))
        self._count_decorations_udf = psf.udf(
            lambda s: list(s).count(attachment_token), pst.IntegerType())
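A standalone sketch of the same pattern outside the class: wrap ExactMolWt in a Spark UDF and apply it to a DataFrame with a "smiles" column (the column name and session setup here are assumptions).

from pyspark.sql import SparkSession
from pyspark.sql import functions as psf
from pyspark.sql import types as pst
from rdkit import Chem
from rdkit.Chem.Descriptors import ExactMolWt

spark = SparkSession.builder.getOrCreate()
# same molecular-weight UDF as self._mol_wts_udf above
mol_wt_udf = psf.udf(lambda s: ExactMolWt(Chem.MolFromSmiles(s)), pst.FloatType())
df = spark.createDataFrame([("CCO",), ("c1ccccc1",)], ["smiles"])
df.withColumn("mol_wt", mol_wt_udf("smiles")).show()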
Example No. 15
def sdf_text_worker(merged_results, vendors, num_mols, start_time, mol_counter,
                    fragment_counter, drug_like_counter, big_counter,
                    parent_fragment_collector, parent_drug_like_collector,
                    parent_big_collector, failures, addhs, embed, verbose):
    if not verbose:
        RDLogger.DisableLog('rdApp.*')
    fragment_collector, drug_like_collector, big_collector = [], [], []
    for index, row in merged_results.iterrows():
        try:
            mol = Chem.MolFromSmiles(row['smiles'])
            if addhs:
                mol = Chem.AddHs(mol)
            if embed:
                AllChem.EmbedMolecule(mol)
            properties = {vendor: row[vendor] for vendor in vendors}
            mol_name = ','.join([
                identifier for identifier in properties.values()
                if len(identifier) > 0
            ])
            if len(mol_name) > 20:
                mol_name = mol_name[:17] + '...'
            mol.SetProp('_Name', mol_name)
            properties['smiles'] = row['smiles']
            molecular_weight = ExactMolWt(mol)
        except:
            failures.append(' '.join(['write_error', row['smiles']]))
            molecular_weight = 10000
        if molecular_weight < 1200:
            if molecular_weight < 300:
                with fragment_counter.get_lock():
                    fragment_counter.value += 1
                fragment_collector.append(sdf_text(mol, properties))
            elif 300 <= molecular_weight < 700:
                with drug_like_counter.get_lock():
                    drug_like_counter.value += 1
                drug_like_collector.append(sdf_text(mol, properties))
            else:
                with big_counter.get_lock():
                    big_counter.value += 1
                big_collector.append(sdf_text(mol, properties))
        with mol_counter.get_lock():
            mol_counter.value += 1
            update_progress(mol_counter.value / num_mols,
                            'Progress of writing',
                            ((time.time() - start_time) / mol_counter.value) *
                            (num_mols - mol_counter.value))
    parent_fragment_collector.extend(fragment_collector)
    parent_drug_like_collector.extend(drug_like_collector)
    parent_big_collector.extend(big_collector)
    return
Example No. 16
def mol_remover(smile, mol):
    remove = False
    reason = 0
    if len(smile) == 0:
        reason = "No smile this line, removed"
        remove = True
    #mol = MolFromSmiles(smile)
    if ExactMolWt(mol) > 700:
        reason = "Molecule too heavy, removed"
        remove = True
    """
    remover = SaltRemover(defnData = "[Cl]")
    res = remover(mol)
    if res is not None:
        reason = "Include salt"
        remove = True
    """
    return remove, reason
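A usage sketch; the caller passes both the SMILES string and the already-parsed Mol, and the RDKit imports below are what the function relies on.

from rdkit.Chem import MolFromSmiles
from rdkit.Chem.Descriptors import ExactMolWt

smile = "CCO"
remove, reason = mol_remover(smile, MolFromSmiles(smile))
print(remove, reason)  # False 0 -- ethanol is well under the 700 Da cutoff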
Example No. 17
def calc_properties(smi):
    # returns logP, TPSA, MW, MR
    # normalize quantities
    m = Chem.MolFromSmiles(smi.numpy())
    logP = np.asarray(MolLogP(m))
    logP = (logP - LOGP_MEAN) / LOGP_STD

    tpsa = np.asarray(CalcTPSA(m))
    tpsa = np.log10(tpsa + 1)
    tpsa = (tpsa - TPSA_MEAN) / TPSA_STD

    # sas = calculateScore(m)

    mw = np.asarray(ExactMolWt(m))
    mw = np.log10(mw + 1)
    mw = (mw - MW_MEAN) / MW_STD

    mr = np.asarray(MolMR(m))
    mr = np.log10(mr + 1)
    mr = (mr - MR_MEAN) / MR_STD
    return logP, tpsa, mw, mr
Example No. 18
def get_target_data(data, target_id, act_type='IC50'):
    """Returns a data frame of all the ligands for a given target
       Also makes sure that all the smiles are valid, and
       filters by weight."""
    if act_type is not None:
        data = data[data.act_type == act_type]
    target_data = data[data.target_id == target_id]
    # Filter by molecules that can be converted by rdkit
    n_ligs = target_data.shape[0]
    mols = np.zeros(n_ligs, dtype=object)
    for i in range(n_ligs):
        try:
            mols[i] = MolFromSmiles(target_data.smiles.iloc[i])
        except:
            mols[i] = None
    mols = pd.Series(mols)
    target_data = target_data[[not m for m in mols.isna()]]
    # Filter by weight
    weights = target_data.smiles.apply(lambda x: ExactMolWt(MolFromSmiles(x)))
    target_data = target_data[(weights >= 100) & (weights <= 600)]
    return target_data
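A usage sketch with a toy activity table; the column names mirror what the function expects (act_type, target_id, smiles), and the pandas/numpy/RDKit imports are what it relies on.

import numpy as np
import pandas as pd
from rdkit.Chem import MolFromSmiles
from rdkit.Chem.Descriptors import ExactMolWt

data = pd.DataFrame({
    "target_id": ["T1", "T1", "T2"],
    "act_type": ["IC50", "IC50", "Ki"],
    "smiles": ["CCO", "CC(=O)Oc1ccccc1C(=O)O", "c1ccccc1"],
})
# ethanol (46 Da) is dropped by the 100-600 Da weight filter; aspirin is kept
print(get_target_data(data, "T1"))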
Example No. 19
def load_mol(mol, tag):
    smiles = MolToSmiles(mol)

    Chemical = apps.get_model('cspace.Chemical')
    if Chemical.objects.filter(smiles=smiles).count():
        chem = Chemical.objects.get(smiles=smiles)
        chem.tags.add(tag)

        return -1

    props = get_mol_props_dict(mol)
    chem = Chemical(smiles=smiles,
                    mol_weight=ExactMolWt(mol),
                    chem_name=props.get('PUBCHEM_IUPAC_NAME', 'MISSING_NAME'),
                    pubchem_compound_cid=props.get('PUBCHEM_COMPOUND_CID',
                                                   'MISSING_ID'),
                    props_json=json.dumps(props),
                    tpsa=TPSA(mol))

    chem.save()
    chem.tags.add(tag)
    return 1
Example No. 20
    def __init__(self, smiles):
        self.smiles = smiles

        self.possible_bonds = possible_bonds
        self.table_of_elements = table_of_elements
        self.vocab_nodes_encode = vocab_nodes_encode
        self.mol = Chem.MolFromSmiles(smiles)

        self.adj = self._get_adj_mat(smiles)
        self.node_list = self._get_node_list(smiles)
        self.num_atom = len(self.node_list)
        self.expand_mat = self._get_expand_mat(self.adj, self.node_list)
        self.life_time = 0
        self.pool_life_time = 0
        self.similarity = -1

        self.property = {
            'qed': qed(self.mol),
            'J_score': calc_score(self.mol),
            'MW' : ExactMolWt(self.mol)
        }
        self.prior_flag = False
Example No. 21
def calc_properties(smi):
    """
    :param smi:
    :return: logP, TPSA, MR, MW
    """
    m = Chem.MolFromSmiles(smi.numpy())
    logP = np.asarray(MolLogP(m))
    logP = (logP - LOGP_MEAN) / LOGP_STD

    tpsa = np.asarray(CalcTPSA(m))
    tpsa = np.log10(tpsa + 1)
    tpsa = (tpsa - TPSA_MEAN) / TPSA_STD

    # sas = calculateScore(m)

    mw = np.asarray(ExactMolWt(m))
    mw = np.log10(mw + 1)
    mw = (mw - MW_MEAN) / MW_STD

    mr = np.asarray(MolMR(m))
    mr = np.log10(mr + 1)
    mr = (mr - MR_MEAN) / MR_STD
    return logP, tpsa, mr, mw
Example No. 22
def main():

    enamine_df = pd.read_csv('Enamine_submissions.csv',
                             usecols=['SMILES', 'CID'])
    SA_df = pd.read_csv('covid_SA_file_new.csv',
                        usecols=['SMILES', 'MW', 'CID'])
    #SA_df = pd.read_csv('covid_SA_file.csv', usecols=['SMILES', 'MW', 'CID'])
    score_df = pd.read_csv('score_data.csv',
                           usecols=['SMILES', 'TITLE', 'Chemgauss4 Score'])

    SA_df['SMILES'] = SA_df['SMILES'].apply(
        lambda x: MolToSmiles(MolFromSmiles(x)))
    enamine_df['SMILES'] = enamine_df['SMILES'].apply(
        lambda x: MolToSmiles(MolFromSmiles(x)))
    enamine_df['MW'] = enamine_df['SMILES'].apply(
        lambda x: ExactMolWt(MolFromSmiles(x)))
    score_df['SMILES'] = score_df['SMILES'].apply(
        lambda x: MolToSmiles(MolFromSmiles(x)))
    score_df = score_df.rename(columns={'TITLE': 'CID'})

    score_df = score_df[score_df['Chemgauss4 Score'] < -6]
    SA_df = SA_df[SA_df['MW'] > 250]
    enamine_df = enamine_df[enamine_df['MW'] > 250]

    #SA_df.to_csv('covid_SA_file.csv', index=False)
    #enamine_df.to_csv('Enamine_submissions.csv', index=False)
    #score_df.to_csv('score_data.csv', index=False)

    SA_df['source'] = 'covid_SA'
    enamine_df['source'] = 'enamine'

    print('Post-filtering:')
    print(SA_df.describe())
    print(enamine_df.describe())

    merged_synth_df = pd.merge(SA_df, enamine_df, how='outer')
    print(merged_synth_df[merged_synth_df.duplicated(subset='SMILES',
                                                     keep=False)])
    # merged_synth_df = SA_df

    score_synth_df = pd.merge(merged_synth_df.drop_duplicates(subset='CID'),
                              score_df.drop_duplicates(subset='CID'),
                              how='inner',
                              on='CID')
    score_synth_df = score_synth_df.rename(columns={
        'SMILES_y': 'SMILES',
        'Chemgauss4 Score': 'dock_score'
    })
    score_synth_df = score_synth_df.drop_duplicates(subset='SMILES')

    # with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    #    print(score_synth_df)
    merged_synth_df[~merged_synth_df['CID'].isin(score_synth_df['CID']
                                                 )].to_csv('missing.csv',
                                                           index=False)

    final_df = score_synth_df[['SMILES', 'source', 'CID', 'MW',
                               'dock_score']].sort_values(by='dock_score')
    print(final_df['source'].value_counts())
    final_df.to_csv('processed_data.csv', index=False)

    score_synth_df = pd.merge(enamine_df.drop_duplicates(subset='CID'),
                              score_df.drop_duplicates(subset='CID'),
                              how='inner',
                              on='CID')
    score_synth_df = score_synth_df.rename(columns={
        'SMILES_y': 'SMILES',
        'Chemgauss4 Score': 'dock_score'
    })
    score_synth_df = score_synth_df.drop_duplicates(subset='SMILES')
    #with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    #    print(score_synth_df)
    final_df_2 = score_synth_df[[
        'SMILES', 'source', 'CID', 'MW', 'dock_score'
    ]].sort_values(by='dock_score')
    print(final_df_2['source'].value_counts())
    final_df_2.to_csv('processed_data_enamine.csv', index=False)
Example No. 23
def cal_prop(s):
    m = Chem.MolFromSmiles(s)
    if m is None : return None
    return Chem.MolToSmiles(m), ExactMolWt(m), MolLogP(m), CalcNumHBD(m), CalcNumHBA(m), CalcTPSA(m)
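A usage sketch; the imports below are what cal_prop relies on (assumed to be present in the original module).

from rdkit import Chem
from rdkit.Chem.Descriptors import ExactMolWt
from rdkit.Chem.Crippen import MolLogP
from rdkit.Chem.rdMolDescriptors import CalcNumHBD, CalcNumHBA, CalcTPSA

print(cal_prop("CC(=O)Oc1ccccc1C(=O)O"))  # (canonical SMILES, MW, logP, HBD, HBA, TPSA)
print(cal_prop("not a valid smiles"))     # None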
Example No. 24
    def feature(self, mol_string):
        from rdkit.Chem.Descriptors import ExactMolWt
        feature = np.array([ExactMolWt(Chem.MolFromSmiles(mol_string))],
                           np.float32)
        return feature
Example No. 25
from rdkit import Chem
from rdkit.Chem.Descriptors import ExactMolWt

with open('../id_smiles.txt') as f, open('data.txt', 'w') as w:
    for l in f:
        m_id, s1, s2 = l.split()
        m1, m2 = Chem.MolFromSmiles(s1), Chem.MolFromSmiles(s2)
        if m1 is None or m2 is None: continue
        c1, c2 = ExactMolWt(m1), ExactMolWt(m2)
        w.write(m_id + '\t' + str(c1) + '\t' + str(c2) + '\n')
Example No. 26
def lipinski_filter(smiles):
    mol = MolFromSmiles(smiles)
    return MolLogP(mol) <= 5 and NumHAcceptors(mol) <= 10 and NumHDonors(mol) <= 5 and 100 <= ExactMolWt(mol) <= 500
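A usage sketch; the imports below are one plausible way the names used by lipinski_filter come in.

from rdkit.Chem import MolFromSmiles
from rdkit.Chem.Crippen import MolLogP
from rdkit.Chem.Lipinski import NumHAcceptors, NumHDonors
from rdkit.Chem.Descriptors import ExactMolWt

print(lipinski_filter("CC(=O)Oc1ccccc1C(=O)O"))  # True: aspirin passes all four checks
print(lipinski_filter("C"))                      # False: methane is below the 100 Da floor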
Example No. 27
_, _, char, vocab, _, _ = load_data(args.prop_file, args.seq_length)
vocab_size = len(char)

model = GNMTP(vocab_size, args)
model.restore(args.save_file)

target_prop = np.array([[float(p) for p in args.target_prop.split()] for _ in range(args.batch_size)])
start_codon = np.array([np.array(list(map(vocab.get, 'X')))for _ in range(args.batch_size)])

smiles = []
for _ in range(args.num_iteration):
    latent_vector = s = np.random.normal(args.mean, args.stddev, (args.batch_size, args.latent_size))
    generated = model.sample(latent_vector, target_prop, start_codon, args.seq_length)
    smiles += [convert_to_smiles(generated[i], char) for i in range(len(generated))]

print ('number of trial : ', len(smiles))
smiles = list(set([s.split('E')[0] for s in smiles]))
print ('number of generated smiles : ', len(smiles))
ms = [Chem.MolFromSmiles(s) for s in smiles]
ms = [m for m in ms if m is not None]
print ('number of valid smiles : ', len(ms))
with open(args.result_filename, 'w') as w:
    w.write('smiles\t MW\t LogP\t TPSA\n')
    for m in ms:
        try:
            w.write('%s\t%.3f\t%.3f\t%.3f\n' %(Chem.MolToSmiles(m), ExactMolWt(m), MolLogP(m), CalcTPSA(m)))
        except:
            continue            
Example No. 28
def update_dataset(
        gen_evaluated,
        data_file,
        target='Target',
        threshold=0.8,  # Threshold improvement required to be kept (based on optimization target)
        screen_file1=None,
        selection_type1=None,
        selection_thresh1=None,
        keep1=None,
        min_mol_wt=50,  #(g/mol)
        pairing_method='bemis_murcko',
        n_clusters=None,
        tan_threshold=None):

    paired = pd.read_csv(gen_evaluated)

    # Remove molecules that do not follow screen 1:
    if screen_file1 is not None:
        screen1 = pd.read_csv(screen_file1)
        # look up each Mol2 SMILES in the screen table (first column: SMILES, second: label)
        paired['Screen2_1'] = paired['Mol2'].apply(lambda x: screen1[screen1[
            screen1.columns[0]] == x][screen1.columns[1]].iloc[0])
        paired = apply_screen(paired, 'Screen2_1', selection_type1,
                              selection_thresh1, keep1)

    # Remove Y molecules with low mw:
    paired['MolWt2'] = paired['Mol2'].apply(
        lambda x: ExactMolWt(Chem.MolFromSmiles(x)))
    paired = paired[paired['MolWt2'] > min_mol_wt]

    # Remove molecules outside scaffold
    if pairing_method == 'bemis_murcko':
        paired['Scaffold1'] = paired['Mol1'].apply(generate_scaffold)
        paired['Scaffold2'] = paired['Mol2'].apply(generate_scaffold)
        paired = paired[paired['Scaffold1'] == paired['Scaffold2']]
    elif pairing_method == 'tanimoto':
        mol_list = pd.concat(
            (paired[['Mol1']].rename(columns={'Mol1': 'SMILES'}),
             paired[['Mol2'
                     ]].rename(columns={'Mol2': 'SMILES'}))).drop_duplicates()
        adj = tan_adjacency(pd.DataFrame(mol_list))
        labels = adjacency_clusters(adj, n_clusters, threshold)

        mol_list['cluster'] = labels
        paired['Scaffold1'] = paired['Mol1'].apply(
            lambda x: mol_list[mol_list['SMILES'] == x]['cluster'].values[0])
        paired['Scaffold2'] = paired['Mol2'].apply(
            lambda x: mol_list[mol_list['SMILES'] == x]['cluster'].values[0])
        paired = paired[paired['Scaffold1'] == paired['Scaffold2']]
    else:
        raise Exception('Unsupported pairing option:', pairing_method)

    # Make labeled dataset for input into next iteration:
    x_labeled = paired[['Mol1', 'Target1']].rename(columns={
        'Mol1': 'SMILES',
        'Target1': target
    })
    y_labeled = paired[['Mol2', 'Target2']].rename(columns={
        'Mol2': 'SMILES',
        'Target2': target
    })

    data_out = pd.concat([x_labeled,
                          y_labeled]).drop_duplicates(subset=['SMILES'],
                                                      keep='last')

    # Save data.csv file
    data_out.to_csv(data_file, index=False)
Example No. 29
def standardize_mols(jobs, mol_counter, num_mols, results, start_time, vendors, max_stereo_isomers, failures,
                     tautomer, verbose):
    """
    This function passes molecules to the standardization functions.

    Parameters
    ----------
    jobs: multiprocessing.manager.list
        A list containing job information as dictionaries.

    mol_counter: multiprocessing.manager.value
        A counter keeping track of processed molecules.

    num_mols: int
        Total number of molecules to be processed.

    results: multiprocessing.manager.list
        A list containing lists describing the processed molecules.

    start_time: float
        Starting time of molecule processing.

    vendors: list
        List of vendors.

    max_stereo_isomers: int
        Maximal number of stereo isomers to generate per molecule.

    failures: multiprocessing.manager.list
        A list collecting descriptions of molecules that failed standardization.

    tautomer: bool
        If tautomers should be canonicalized.

    verbose: bool
        If RDKit warnings should be displayed.

    """
    if not verbose:
        RDLogger.DisableLog('rdApp.*')
    job = 'initiate'
    processed_mols = []
    while job is not None:
        try:
            job = jobs.pop(0)
            vendor_position = vendors.index(job['vendor'])
            supplier = Chem.SDMolSupplier(job['sdf_path'])
            for mol_id in range(job['mol_start'], job['mol_end'] + 1):
                mol = supplier[mol_id]
                if job['identifier_field'] == 'None':
                    identifier = 'unknown'
                else:
                    try:
                        identifier = mol.GetProp(job['identifier_field'])
                    except AttributeError:
                        identifier = 'unknown'
                try:
                    # generate smiles for error catching
                    smiles = 'unknown'
                    smiles = Chem.MolToSmiles(mol)
                    # default standardization from molvs
                    mol = Standardizer().standardize(mol)
                    # choose largest fragment
                    mol = LargestFragmentChooser().choose(mol)
                    # canonicalize tautomer
                    if tautomer:
                        mol = TautomerCanonicalizer().canonicalize(mol)
                    # protonate mol
                    mol = protonate_mol(mol)
                    # molecular weight will not change anymore
                    if ExactMolWt(mol) < 1200:
                        # enumerate stereo isomers and append mols
                        if max_stereo_isomers > 0:
                            for mol in enumerate_stereo_isomers(mol, max_stereo_isomers):
                                mol_as_list = [Chem.MolToSmiles(mol)] + [''] * len(vendors)
                                mol_as_list[1 + vendor_position] = identifier
                                processed_mols.append(mol_as_list)
                        else:
                            mol_as_list = [Chem.MolToSmiles(mol)] + [''] * len(vendors)
                            mol_as_list[1 + vendor_position] = identifier
                            processed_mols.append(mol_as_list)
                except:
                    failures.append(' '.join(['standardize_error', smiles, job['vendor'], identifier]))
                with mol_counter.get_lock():
                    mol_counter.value += 1
                update_progress(mol_counter.value / num_mols, 'Progress of standardization',
                                ((time.time() - start_time) / mol_counter.value) * (num_mols - mol_counter.value))
        except IndexError:
            job = None
    results += processed_mols
    return
Example No. 30
    def calculate(self):
        w = ExactMolWt(self.mol) if self._exact else MolWt(self.mol)
        if self._averaged:
            w /= self.mol.GetNumAtoms()

        return w
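For context, a quick illustration of the two weights this method switches between (MolWt is the average, isotope-abundance-weighted mass; ExactMolWt is the monoisotopic mass).

from rdkit import Chem
from rdkit.Chem.Descriptors import ExactMolWt, MolWt

mol = Chem.MolFromSmiles("Brc1ccccc1")  # bromobenzene
print(MolWt(mol))       # ~157.01 (average molecular weight)
print(ExactMolWt(mol))  # ~155.96 (monoisotopic, with 79Br)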