Пример #1
0
    def use(self, smiles: list, backend: str = 'padel'):
        ''' use: uses the trained project to predict values for supplied
        molecules; the returned values are the mean (along axis 0) of the
        predictions made by every model in the project

        Args:
            smiles (list): list of SMILES strings to predict for
            backend (str): backend software to use for QSPR generation; `padel`
                or `alvadesc`; default = `padel`; alvadesc requires valid
                license

        Returns:
            numpy.array: predicted values, averaged over all models

        Raises:
            ValueError: if `backend` is neither `padel` nor `alvadesc`
        '''

        if backend == 'alvadesc':
            mols = [smiles_to_descriptors(s) for s in smiles]
        elif backend == 'padel':
            mols = [from_smiles(s) for s in smiles]
        else:
            raise ValueError('Unknown backend software: {}'.format(backend))

        # 'na' and '' are placeholders for descriptors that have no value;
        # both are mapped to 0 for either backend so that float() never sees
        # a non-numeric string.
        missing = ('na', '')
        input_names = self._df._input_names
        # The descriptor matrix is identical for every model, so build it once.
        inputs = asarray([[
            0.0 if mol[name] in missing else float(mol[name])
            for name in input_names
        ] for mol in mols])
        return mean([model.use(inputs) for model in self._models], axis=0)
Пример #2
0
def _qspr_from_alvadesc(smiles: List[str]) -> Tuple[List[List[float]], List[str]]:
    """
    Generates descriptors for the supplied SMILES strings with
    `smiles_to_descriptors` and returns them as rows of floats; the column
    order follows the keys of the first returned record, and any value equal
    to the string 'na' is replaced by 0.0

    Args:
        smiles (list[str]): list of SMILES strings

    Returns:
        Tuple[List[List[float]], List[str]]: (descriptors w/ shape (n_compounds, n_desc),
            descriptor names)
    """

    records = smiles_to_descriptors(smiles)
    names = list(records[0].keys())

    # First pass: overwrite every 'na' placeholder in the returned records.
    for record in records:
        for name in names:
            if record[name] == 'na':
                record[name] = 0.0

    # Second pass: convert each record into a row of floats, in `names` order.
    values = []
    for record in records:
        values.append([float(record[name]) for name in names])
    return (values, names)
Пример #3
0
def create_db(smiles: list,
              db_name: str,
              targets: list = None,
              id_prefix: str = '',
              extra_strings: dict = None,
              backend: str = 'padel',
              convert_mdl: bool = False):
    ''' create_db: creates an ECNet-formatted database from SMILES strings
    using either PaDEL-Descriptor or alvaDesc software; using alvaDesc
    requires a valid installation/license of alvaDesc

    With the `padel` backend, molecules whose descriptor calculation raises
    a RuntimeError are omitted from the database (a RuntimeWarning is
    issued); the supplied `smiles`, `targets` and `extra_strings` objects
    are never modified

    Args:
        smiles (list): list of SMILES strings
        db_name (str): name/path of database being created
        targets (list): target (experimental) values, align with SMILES
            strings; if None, all TARGETs set to 0
        id_prefix (str): prefix of molecule DATAID, if desired
        extra_strings (dict): extra STRING columns, label = name, value = list
            with length equal to number of SMILES strings; None (default) is
            treated as no extra columns
        backend (str): software used to calculate QSPR descriptors, 'padel' or
            'alvadesc'
        convert_mdl (bool): if `True`, converts SMILES strings to MDL 3D
            format before calculating descriptors (PaDEL only)

    Raises:
        ValueError: if `targets` or any `extra_strings` value differs in
            length from `smiles`, or if `backend` is unknown
        ImportError: if `convert_mdl` is `True` but pybel is not installed
        IndexError: if no descriptors were generated (empty `smiles`, or
            every molecule was omitted)
    '''

    # Avoid a shared mutable default argument.
    if extra_strings is None:
        extra_strings = {}

    if targets is not None and len(targets) != len(smiles):
        raise ValueError('Must supply same number of targets as SMILES '
                         'strings: {}, {}'.format(len(targets), len(smiles)))

    for string, values in extra_strings.items():
        if len(values) != len(smiles):
            raise ValueError('Extra string values for {} not equal in length '
                             'to supplied SMILES: {}, {}'.format(
                                 string, len(values), len(smiles)))

    # `mols` holds descriptor dicts; `kept` holds, in the same order, the
    # index of the SMILES string each one was generated from. Failures are
    # skipped rather than deleted from the inputs mid-iteration, which would
    # skip the following molecule and misalign the remaining rows.
    mols = []
    kept = []
    if backend == 'alvadesc':
        for idx, mol in enumerate(smiles):
            mols.append(smiles_to_descriptors(mol))
            kept.append(idx)
    elif backend == 'padel':
        for idx, mol in enumerate(smiles):
            desc = None
            if convert_mdl is True:
                if pybel is None:
                    raise ImportError(
                        'pybel (Python Open Babel wrapper) not installed, '
                        'cannot convert SMILES to MDL')
                mdl = pybel.readstring('smi', mol)
                mdl.make3D()
                curr_time = datetime.now().strftime('%Y%m%d%H%M%S%f')[:-3]
                mdl_path = '{}.mdl'.format(curr_time)
                mdl.write('mdl', mdl_path)
                try:
                    desc = from_mdl(mdl_path)[0]
                except RuntimeError:
                    desc = None
                finally:
                    # Always clean up the temporary MDL file, even when an
                    # unexpected exception propagates.
                    remove(mdl_path)
            else:
                try:
                    desc = from_smiles(mol)
                except RuntimeError:
                    desc = None
            if desc is None:
                warn(
                    'Could not calculate descriptors for {}, omitting'.
                    format(mol), RuntimeWarning)
                continue
            mols.append(desc)
            kept.append(idx)
    else:
        raise ValueError('Unknown backend software: {}'.format(backend))

    # Restrict every per-molecule input to the molecules that were kept.
    kept_smiles = [smiles[i] for i in kept]
    kept_targets = None
    if targets is not None:
        kept_targets = [targets[i] for i in kept]
    kept_strings = {
        string: [values[i] for i in kept]
        for string, values in extra_strings.items()
    }

    type_row = ['DATAID', 'ASSIGNMENT', 'STRING', 'STRING']
    title_row = ['DATAID', 'ASSIGNMENT', 'Compound Name', 'SMILES']
    strings = list(kept_strings.keys())
    for string in strings:
        # 'Compound Name' already has a fixed column in the header.
        if string != 'Compound Name':
            type_row.append('STRING')
            title_row.append(string)
    type_row.append('TARGET')
    title_row.append('TARGET')
    descriptor_keys = list(mols[0].keys())
    for key in descriptor_keys:
        type_row.append('INPUT')
        title_row.append(key)

    mol_rows = []
    for idx, desc in enumerate(mols):
        # Missing descriptor values ('na' or '') are stored as 0.
        for key in descriptor_keys:
            if desc[key] == 'na' or desc[key] == '':
                desc[key] = 0
        mol = _Molecule('{}{:04d}'.format(id_prefix, idx + 1))
        for string in strings:
            mol.strings[string] = kept_strings[string][idx]
        if kept_targets is not None:
            mol.target = kept_targets[idx]
        mol.inputs = desc
        mol_rows.append(mol)

    # The context manager closes the file; no explicit close() is needed.
    with open(db_name, 'w', encoding='utf-8') as db_file:
        wr = writer(db_file, delimiter=',', lineterminator='\n')
        wr.writerow(type_row)
        wr.writerow(title_row)
        for idx, mol in enumerate(mol_rows):
            row = [
                mol.id, mol.assignment, mol.strings['Compound Name'],
                kept_smiles[idx]
            ]
            for string in strings:
                if string != 'Compound Name':
                    row.append(mol.strings[string])
            row.append(mol.target)
            for key in descriptor_keys:
                row.append(mol.inputs[key])
            wr.writerow(row)
Пример #4
0
from alvadescpy import smiles_to_descriptors

if __name__ == '__main__':

    # Single SMILES string, two named descriptors requested.
    single_result = smiles_to_descriptors(
        'CCCOC',
        descriptors=['MW', 'AMW'],
        labels=True,
    )
    print(single_result)

    # List of SMILES strings with descriptors='ALL'; print how many results
    # came back, then the size of the first result.
    batch_results = smiles_to_descriptors(
        ['CCCC', 'CCOCC', 'CCCCC'],
        descriptors='ALL',
        labels=True,
    )
    print(len(batch_results))
    first_result = batch_results[0]
    print(len(first_result))
Пример #5
0
from alvadescpy import smiles_to_descriptors


if __name__ == '__main__':

    # Request two named descriptors for a single SMILES string and print
    # whatever smiles_to_descriptors returns.
    result = smiles_to_descriptors(
        'CCC',
        descriptors=['MW', 'AMW'],
        labels=True,
    )
    print(result)