Пример #1
0
    def setUp(self):
        """
        Set up tests.

        Builds three named molecules (aspirin, ibuprofen, celecoxib) with 2D
        coordinates, writes them to a temporary file with a '.sdf.gz' suffix
        inside a fresh temporary directory, and creates the DatasetSharder
        (with write_shards=False) and MolReader used by the tests.
        """
        import os  # local import: only needed to release the mkstemp handle

        # generate molecules
        smiles = [
            'CC(=O)OC1=CC=CC=C1C(=O)O', 'CC(C)CC1=CC=C(C=C1)C(C)C(=O)O',
            'CC1=CC=C(C=C1)C2=CC(=NN2C3=CC=C(C=C3)S(=O)(=O)N)C(F)(F)F'
        ]
        names = ['aspirin', 'ibuprofen', 'celecoxib']
        self.mols = []
        for s, n in zip(smiles, names):
            mol = Chem.MolFromSmiles(s)
            mol.SetProp('_Name', n)
            AllChem.Compute2DCoords(mol)
            self.mols.append(mol)

        # write molecules to file
        self.temp_dir = tempfile.mkdtemp()
        writer = serial.MolWriter()
        # mkstemp hands back an already-open OS-level descriptor; close it
        # right away because the writer reopens the file by name.
        fd, self.filename = tempfile.mkstemp(dir=self.temp_dir,
                                             suffix='.sdf.gz')
        os.close(fd)
        with writer.open(self.filename) as w:
            w.write(self.mols)

        self.sharder = DatasetSharder(filename=self.filename,
                                      write_shards=False)
        self.reader = serial.MolReader()
Пример #2
0
    def test_not_close_other(self):
        """
        Make sure MolIO doesn't close files it didn't open.

        Checked both for an explicit ``close()`` call on the reader and for
        leaving the reader's context manager.
        """
        import os  # local import: only needed to release the mkstemp handle

        # mkstemp returns an already-open OS-level descriptor; close it so the
        # test does not leak it (the file is reopened by name below).
        fd, filename = tempfile.mkstemp(suffix='.sdf', dir=self.temp_dir)
        os.close(fd)

        with open(filename) as f:
            reader = serial.MolReader(f, mol_format='sdf')
            reader.close()
            assert not f.closed

        # also test the context manager
        with open(filename) as g:
            with serial.MolReader(g, mol_format='sdf'):
                pass
            assert not g.closed
Пример #3
0
 def read_data(self):
     """
     Read all molecules from ``self.filename`` and return them as a list.
     """
     opened = serial.MolReader().open(self.filename)
     with opened as handle:
         # materialize before the context manager exits
         return list(handle)
Пример #4
0
def read_mols(input_filename, mol_id_prefix=None):
    """
    Load molecules from a file together with their names.

    Parameters
    ----------
    input_filename : str
        Filename containing molecules.
    mol_id_prefix : str, optional
        If given, prepended to the name of every molecule that has a
        '_Name' property.

    Returns
    -------
    tuple
        ``(mols, names)`` as numpy arrays. The name entry is None for any
        molecule without a '_Name' property.
    """
    print("Reading molecules...")
    collected, labels = [], []
    with serial.MolReader().open(input_filename) as reader:
        for mol in reader.get_mols():
            collected.append(mol)
            # unnamed molecules get a None placeholder so both sequences
            # stay aligned
            if not mol.HasProp('_Name'):
                labels.append(None)
                continue
            label = mol.GetProp('_Name')
            if mol_id_prefix is not None:
                label = mol_id_prefix + label
            labels.append(label)
    return np.asarray(collected), np.asarray(labels)
Пример #5
0
def read_mols(input_filename, mol_id_prefix=None, log_every_N=1000):
    """
    Read molecules from an input file and extract names.

    Parameters
    ----------
    input_filename : str
      Filename containing molecules.
    mol_id_prefix : str, optional
      If given, prepended to the name of every molecule that has a '_Name'
      property.
    log_every_N: int
      Print log statement every N molecules read. A falsy value (0 or None)
      disables the per-molecule progress messages.

    Returns
    -------
    tuple
      ``(mols, names)`` as numpy arrays. The name entry is None for any
      molecule without a '_Name' property.
    """
    print("Reading molecules...")
    mols = []
    names = []
    with serial.MolReader().open(input_filename) as reader:
        for num, mol in enumerate(reader.get_mols()):
            # honor the caller-supplied interval (previously hard-coded to
            # 1000); the truthiness guard avoids a modulo-by-zero
            if log_every_N and num % log_every_N == 0:
                print("Reading molecule %d" % num)
            mols.append(mol)
            if mol.HasProp('_Name'):
                name = mol.GetProp('_Name')
                if mol_id_prefix is not None:
                    name = mol_id_prefix + name
                names.append(name)
            else:
                names.append(None)
    mols = np.asarray(mols)
    names = np.asarray(names)
    print("%d molecules read." % len(mols))
    return mols, names
Пример #6
0
 def test_skip_failures(self):
     """
     Molecules that fail to read should be skipped, yielding nothing here.
     """
     source = StringIO('CO(C)C')
     reader = serial.MolReader(source, 'smi')
     parsed = [mol for mol in reader.get_mols()]
     assert len(parsed) == 0
Пример #7
0
 def test_is_a_salt(self):
     """
     With salt removal enabled, a molecule that _is_ a salt must still come
     back as exactly one molecule with a non-zero atom count.
     """
     source = StringIO('C(=CC(=O)O)C(=O)O')
     reader = serial.MolReader(source, 'smi', remove_salts=True)
     found = [mol for mol in reader.get_mols()]
     assert len(found) == 1
     assert found[0].GetNumAtoms()
Пример #8
0
 def test_no_remove_salts(self):
     """
     Test salt retention.

     Writes two salt forms to an SDF file, reads them back with
     remove_salts=False and checks they match the originals, then checks
     that clean_mol on a remove_salts=True reader yields fewer atoms.
     """
     import os  # local import: only needed to release the mkstemp handle

     # mkstemp returns an already-open OS-level descriptor; close it so the
     # test does not leak it (the file is reopened by name below).
     fd, filename = tempfile.mkstemp(suffix='.sdf', dir=self.temp_dir)
     os.close(fd)

     ref_mols = [self.aspirin_sodium, self.levalbuterol_hcl]
     with open(filename, 'wb') as f:
         for mol in ref_mols:
             f.write(Chem.MolToMolBlock(mol))
             f.write('$$$$\n')  # molecule delimiter

     self.reader = serial.MolReader(remove_salts=False)
     # read inside a context manager so the reader's file is closed even
     # when an assertion below fails
     with self.reader.open(filename) as reader:
         mols = list(reader.get_mols())
     assert len(mols) == 2

     self.reader = serial.MolReader(remove_salts=True)
     for mol, ref_mol in zip(mols, ref_mols):
         assert mol.ToBinary() == ref_mol.ToBinary()
         desalted = self.reader.clean_mol(ref_mol)
         assert mol.GetNumAtoms() > desalted.GetNumAtoms()
Пример #9
0
 def test_remove_hydrogens(self):
     """
     Test hydrogen removal.

     Writes aspirin with explicit hydrogens and checks that reading it with
     remove_hydrogens=True gives a molecule equal to plain aspirin.
     """
     import os  # local import: only needed to release the mkstemp handle

     # mkstemp returns an already-open OS-level descriptor; close it so the
     # test does not leak it (the file is reopened by name below).
     fd, filename = tempfile.mkstemp(suffix='.sdf', dir=self.temp_dir)
     os.close(fd)

     with open(filename, 'wb') as f:
         f.write(Chem.MolToMolBlock(self.aspirin_h))

     # the context manager closes the reader's file even if the assertion
     # fails; next() works on both Python 2 and Python 3 iterators
     with serial.MolReader(remove_hydrogens=True).open(filename) as reader:
         mols = reader.get_mols()
         assert next(mols).ToBinary() == self.aspirin.ToBinary()
Пример #10
0
 def test_read_compressed_file_like(self):
     """
     Read from a file-like object using gzip.

     The reader is handed an already-open gzip file object together with an
     explicit mol_format instead of a filename.
     """
     import os  # local import: only needed to release the mkstemp handle

     # mkstemp returns an already-open OS-level descriptor; close it so the
     # test does not leak it (gzip reopens the file by name below).
     fd, filename = tempfile.mkstemp(suffix='.sdf.gz', dir=self.temp_dir)
     os.close(fd)

     with gzip.open(filename, 'wb') as f:
         f.write(Chem.MolToMolBlock(self.aspirin))
     with gzip.open(filename) as f:
         reader = serial.MolReader(f, mol_format='sdf')
         mols = reader.get_mols()
         # next() works on both Python 2 and Python 3 iterators
         assert next(mols).ToBinary() == self.aspirin.ToBinary()
Пример #11
0
 def test_no_remove_hydrogens(self):
     """
     Test hydrogen retention.

     Writes aspirin with explicit hydrogens and checks that reading it with
     remove_hydrogens=False and remove_salts=False reproduces the same mol
     block.
     """
     import os  # local import: only needed to release the mkstemp handle

     # mkstemp returns an already-open OS-level descriptor; close it so the
     # test does not leak it (the file is reopened by name below).
     fd, filename = tempfile.mkstemp(suffix='.sdf', dir=self.temp_dir)
     os.close(fd)

     with open(filename, 'wb') as f:
         f.write(Chem.MolToMolBlock(self.aspirin_h))

     reader = serial.MolReader(remove_hydrogens=False, remove_salts=False)
     # the context manager closes the reader's file even if the assertion
     # fails; next() works on both Python 2 and Python 3 iterators
     with reader.open(filename) as opened:
         mols = opened.get_mols()
         # FIXME get ToBinary test to work
         # assert next(mols).ToBinary() == self.aspirin_h.ToBinary()
         assert Chem.MolToMolBlock(next(mols)) == Chem.MolToMolBlock(
             self.aspirin_h)
Пример #12
0
def get_smiles(filename, assign_stereo_from_3d=False):
    """
    Collect the SMILES of every molecule in a file.

    Each molecule read from the file is added to a fresh MoleculeDatabase;
    the database's ``smiles`` attribute is returned as a list.

    Parameters
    ----------
    filename : str
        Input molecule filename.
    assign_stereo_from_3d : bool, optional (default False)
        Assign stereochemistry from 3D coordinates.
    """
    db = MoleculeDatabase(assign_stereo_from_3d=assign_stereo_from_3d)
    opened = serial.MolReader().open(filename)
    with opened as mol_reader:
        for entry in mol_reader:
            db.add_mol(entry)
    collected = list(db.smiles)
    return collected
Пример #13
0
def main(input_filenames,
         output_filename,
         id_prefix=None,
         allow_duplicates=True,
         update=False,
         assign_stereo_from_3d=False):
    """
    Build a SMILES map from molecule files and pickle it.

    Molecules that make ``SmilesMap.add_mol`` raise ValueError are reported
    on stdout and skipped.

    Parameters
    ----------
    input_filenames : list
        Input molecule filenames.
    output_filename : str
        Output filename.
    id_prefix : str, optional
        Prefix to prepend to IDs.
    allow_duplicates : bool, optional (default True)
        Allow duplicate SMILES.
    update : bool, optional (default False)
        Update an existing map with the same output filename. If False, a new
        map will be generated using only the input file(s).
    assign_stereo_from_3d : bool, optional (default False)
        Assign stereochemistry from 3D coordinates.
    """
    smiles_map = SmilesMap(prefix=id_prefix,
                           allow_duplicates=allow_duplicates,
                           assign_stereo_from_3d=assign_stereo_from_3d)

    # start from the previously written map when updating
    if update:
        smiles_map.map = read_pickle(output_filename)

    for path in input_filenames:
        print(path)
        with serial.MolReader().open(path) as mol_reader:
            for entry in mol_reader:
                try:
                    smiles_map.add_mol(entry)
                except ValueError:
                    # identify the rejected molecule by name when it has
                    # one, otherwise by its isomeric SMILES
                    if entry.HasProp('_Name'):
                        label = entry.GetProp('_Name')
                    else:
                        label = Chem.MolToSmiles(entry, isomericSmiles=True)
                    print('Skipping {}'.format(label))
    write_pickle(smiles_map.get_map(), output_filename)
Пример #14
0
    def _ionize_3d(self, mol):
        """
        Ionize a molecule while preserving conformers.

        Every conformer is written as its own SDF record, the records are
        piped through ``obabel -i sdf -o sdf -p <self.pH>``, and the output
        is read back with a MolReader. Conformers of all molecules after the
        first are appended to the first molecule.

        Parameters
        ----------
        mol : RDMol
            Molecule. Must have at least one conformer.

        Returns
        -------
        RDMol
            The first molecule read back from obabel, carrying the
            conformers of any further molecules that were read.

        Raises
        ------
        IonizerError
            If reading the obabel output raises RuntimeError, or if no
            molecules are read back.
        """
        assert mol.GetNumConformers() > 0

        # one SDF record per conformer, each closed by the '$$$$' delimiter;
        # joined once instead of growing a string inside the loop
        sdf = ''.join(
            Chem.MolToMolBlock(mol, confId=conf.GetId(), includeStereo=True) +
            '$$$$\n' for conf in mol.GetConformers())

        args = ['obabel', '-i', 'sdf', '-o', 'sdf', '-p', str(self.pH)]
        p = subprocess.Popen(args,
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        ionized_sdf, _ = p.communicate(sdf)
        reader = serial.MolReader(StringIO(ionized_sdf),
                                  mol_format='sdf',
                                  remove_salts=False)  # no changes
        try:
            mols = list(reader.get_mols())
        except RuntimeError as e:  # catch pre-condition violations
            # str(e) rather than e.message: the latter is deprecated since
            # Python 2.6 and does not exist on Python 3
            raise IonizerError(str(e))

        # catch ionizer failure
        if not mols:
            raise IonizerError(mol)

        # detection of stereochemistry based on 3D coordinates might result
        # in issues when attempting to recombine ionized conformers, but we
        # merge them anyway (with a single molecule the loop does nothing)
        ionized_mol = mols[0]
        for other in mols[1:]:
            for conf in other.GetConformers():
                ionized_mol.AddConformer(conf, assignId=True)
        return ionized_mol
Пример #15
0
    def setUp(self):
        """
        Create a temporary directory, the reference molecules, and a reader.

        Molecules built: aspirin (plain, with explicit hydrogens, and as the
        sodium salt) and levalbuterol (plain and as the hydrochloride).
        """
        self.temp_dir = tempfile.mkdtemp()
        build = self._get_mol_from_smiles

        # aspirin
        aspirin_smiles = 'CC(=O)OC1=CC=CC=C1C(=O)O'
        aspirin_sodium_smiles = 'CC(=O)OC1=CC=CC=C1C(=O)[O-].[Na+]'
        self.aspirin = build(aspirin_smiles, 'aspirin')
        self.aspirin_h = Chem.AddHs(self.aspirin)
        self.aspirin_sodium = build(aspirin_sodium_smiles, 'aspirin sodium')

        # levalbuterol (chiral)
        levalbuterol_smiles = 'CC(C)(C)NC[C@@H](C1=CC(=C(C=C1)O)CO)O'
        levalbuterol_hcl_smiles = 'CC(C)(C)NC[C@@H](C1=CC(=C(C=C1)O)CO)O.Cl'
        self.levalbuterol = build(levalbuterol_smiles, 'levalbuterol')
        self.levalbuterol_hcl = build(levalbuterol_hcl_smiles,
                                      'levalbuterol hydrochloride')

        self.ref_mols = [self.aspirin, self.levalbuterol]
        self.reader = serial.MolReader(compute_2d_coords=False)
Пример #16
0
def main(input_filenames,
         output_filename,
         database_filename=None,
         assign_stereo_from_3d=False):
    """
    Add molecules from input files to a molecule database and save it.

    Molecules that make ``MoleculeDatabase.add_mol`` raise ValueError are
    reported on stdout and skipped. The number of molecules added is
    printed before saving.

    Parameters
    ----------
    input_filenames : list
        Input molecule filename(s).
    output_filename : str
        Output filename.
    database_filename : str, optional
        Existing database to update.
    assign_stereo_from_3d : bool, optional (default False)
        Whether to assign stereochemistry from 3D coordinates.
    """
    db = MoleculeDatabase(assign_stereo_from_3d=assign_stereo_from_3d)
    if database_filename is not None:
        db.load(database_filename)
    size_before = len(db)

    for path in input_filenames:
        print(path)
        with serial.MolReader().open(path) as mol_reader:
            for entry in mol_reader:
                try:
                    db.add_mol(entry)
                except ValueError:
                    # identify the rejected molecule by name when it has
                    # one, otherwise by its isomeric SMILES
                    if entry.HasProp('_Name'):
                        label = entry.GetProp('_Name')
                    else:
                        label = Chem.MolToSmiles(entry, isomericSmiles=True)
                    print('Skipping {}'.format(label))

    added = len(db) - size_before
    print('{} molecules added to the database'.format(added))
    db.save(output_filename)
Пример #17
0
def main(active_filename,
         decoy_filename,
         assay_id,
         target,
         with_assay_id=True,
         with_target=True,
         phenotype=None,
         output_filename=None,
         mol_id_prefix=None,
         output_format='.pkl.gz'):
    """
    Build a dataframe from active and decoy molecule files and write it out.

    Rows for both files are produced by ``get_rows`` and combined into one
    dataframe, which must not contain duplicate 'mol_id' values.

    Parameters
    ----------
    active_filename : str
        File containing active molecules (outcome 'active').
    decoy_filename : str
        File containing decoy molecules (outcome 'inactive').
    assay_id : str
        Value for the 'assay_id' column; also the stem of the default
        output filename.
    target : str
        Value for the 'target' column.
    with_assay_id : bool, optional (default True)
        Whether to add the 'assay_id' column.
    with_target : bool, optional (default True)
        Whether to add the 'target' column.
    phenotype : str, optional
        Passed to ``get_rows`` for actives. For decoys it is replaced by
        'inactive' unless it is None.
    output_filename : str, optional
        Output filename. Defaults to '<assay_id>.<output_format>'.
    mol_id_prefix : str, optional
        Passed through to ``get_rows``.
    output_format : str, optional (default '.pkl.gz')
        Extension for the default output filename; a leading dot is
        accepted but not required.
    """
    rows = []
    for outcome, filename in zip(['active', 'inactive'],
                                 [active_filename, decoy_filename]):
        this_phenotype = phenotype
        if outcome == 'inactive' and phenotype is not None:
            this_phenotype = 'inactive'
        with serial.MolReader().open(filename) as reader:
            this_rows = get_rows(reader, outcome, this_phenotype,
                                 mol_id_prefix)
            rows.extend(this_rows)

    # create dataframe
    df = pd.DataFrame(rows)

    # sanity check for duplicate mol_ids
    assert len(np.unique(df['mol_id'])) == len(df)

    # add assay_id and target columns
    if with_assay_id:
        df.loc[:, 'assay_id'] = assay_id
    if with_target:
        df.loc[:, 'target'] = target

    if output_filename is None:
        # strip any leading dot so the default '.pkl.gz' does not produce
        # '<assay_id>..pkl.gz'
        output_filename = '{}.{}'.format(assay_id, output_format.lstrip('.'))
    print('{}\t{}\t{}\t{}'.format(assay_id, target, output_filename, len(df)))
    write_dataframe(df, output_filename)