def read_targets(self):
    """
    Get labels for molecules from SD data fields matching dataset names.

    Each molecule yielded by ``self.read_data()`` is converted to SMILES.
    Every SD property whose name is one of ``self.dataset_names`` is parsed
    with ``int`` and appended to that dataset's list for the molecule's
    SMILES. All other properties are ignored; their names are printed once
    at the end, in the order they were first encountered.

    Returns
    -------
    data : dict
        Nested dictionary containing SMILES and targets for compounds in
        each dataset. Keyed by data->dataset->SMILES->target, where target
        is a list.

    Raises
    ------
    ValueError
        If a matching SD field holds a value that ``int`` cannot parse.
    """
    engine = SmilesGenerator()
    data = {dataset: {} for dataset in self.dataset_names}
    skipped = []  # irrelevant SD fields, in first-seen order (for output)
    seen_skipped = set()  # same names, for constant-time membership tests
    for mol in self.read_data():
        smiles = engine.get_smiles(mol)
        for prop in list(mol.GetPropNames()):
            if prop in data:
                score = int(mol.GetProp(prop))
                # the same SMILES may occur more than once per dataset,
                # so every score is kept
                data[prop].setdefault(smiles, []).append(score)
            elif prop not in seen_skipped:
                seen_skipped.add(prop)
                skipped.append(prop)
    # single-argument call form behaves the same as the print statement
    print('Skipped properties:\n{}'.format('\n'.join(skipped)))
    return data
class TestSmilesGenerator(SmilesTests):
    """
    Tests for SmilesGenerator.
    """
    def setUp(self):
        """
        Run the parent fixture setup and create a default SmilesGenerator.
        """
        super(TestSmilesGenerator, self).setUp()
        self.engine = SmilesGenerator()

    def test_get_smiles(self):
        """
        Check that SMILES from SmilesGenerator.get_smiles parse back into
        molecules with the same number of atoms.
        """
        for mol in self.mols:
            round_trip = Chem.MolFromSmiles(self.engine.get_smiles(mol))
            assert round_trip.GetNumAtoms() == mol.GetNumAtoms()

    def test_get_smiles_3d(self):
        """
        Check SmilesGenerator.get_smiles when stereochemistry is assigned
        from 3D coordinates.
        """
        chiral_tags = [Chem.ChiralType.CHI_TETRAHEDRAL_CW,
                       Chem.ChiralType.CHI_TETRAHEDRAL_CCW]

        # generate conformers for ibuprofen
        conformer_engine = conformers.ConformerGenerator()
        mol = conformer_engine.generate_conformers(self.mols[1])
        assert mol.GetNumConformers() > 0

        # with the default engine neither the SMILES nor the atoms may
        # carry a chirality marker
        plain_smiles = self.engine.get_smiles(mol)
        assert '@' not in plain_smiles
        assert not any(atom.GetChiralTag() in chiral_tags
                       for atom in mol.GetAtoms())

        # regenerate the SMILES with stereo assignment switched on
        self.engine = SmilesGenerator(assign_stereo_from_3d=True)
        stereo_smiles = self.engine.get_smiles(mol)
        assert '@' in stereo_smiles
        parsed = Chem.MolFromSmiles(stereo_smiles)
        assert parsed.GetNumAtoms() == self.mols[1].GetNumAtoms()

        # the atoms of the molecule must now carry a chiral tag as well
        assert any(atom.GetChiralTag() in chiral_tags
                   for atom in mol.GetAtoms())
class TestSmilesGenerator(SmilesTests):
    """
    Exercise SmilesGenerator.
    """
    def setUp(self):
        """
        Build the inherited fixtures, then attach a default engine.
        """
        super(TestSmilesGenerator, self).setUp()
        self.engine = SmilesGenerator()

    def test_get_smiles(self):
        """
        SmilesGenerator.get_smiles output must round-trip through RDKit
        without changing the atom count.
        """
        for original in self.mols:
            generated = self.engine.get_smiles(original)
            rebuilt = Chem.MolFromSmiles(generated)
            assert rebuilt.GetNumAtoms() == original.GetNumAtoms()

    def test_get_smiles_3d(self):
        """
        SmilesGenerator.get_smiles with stereochemistry assigned from 3D
        coordinates.
        """
        tetrahedral = [Chem.ChiralType.CHI_TETRAHEDRAL_CW,
                       Chem.ChiralType.CHI_TETRAHEDRAL_CCW]

        def has_chiral_atom(candidate):
            """Return True if any atom carries a tetrahedral chiral tag."""
            return any(atom.GetChiralTag() in tetrahedral
                       for atom in candidate.GetAtoms())

        # generate conformers for ibuprofen
        mol = conformers.ConformerGenerator().generate_conformers(
            self.mols[1])
        assert mol.GetNumConformers() > 0

        # check that chirality has not yet been assigned
        assert '@' not in self.engine.get_smiles(mol)
        assert not has_chiral_atom(mol)

        # generate SMILES with stereo assignment from the 3D coordinates
        self.engine = SmilesGenerator(assign_stereo_from_3d=True)
        smiles = self.engine.get_smiles(mol)
        assert '@' in smiles
        new = Chem.MolFromSmiles(smiles)
        assert new.GetNumAtoms() == self.mols[1].GetNumAtoms()

        # check that chirality was assigned to ibuprofen
        assert has_chiral_atom(mol)
def setUp(self):
    """
    Set up tests.

    Builds three named molecules (aspirin, ibuprofen, celecoxib) with
    labels ``self.y``, writes actives and decoys to separate SMILES files
    inside a fresh temporary directory, reserves an output filename in that
    directory, and stores the reference SMILES of every molecule in
    ``self.smiles``.
    """
    smiles = ['CC(=O)OC1=CC=CC=C1C(=O)O',
              'CC(C)CC1=CC=C(C=C1)C(C)C(=O)O',
              'CC1=CC=C(C=C1)C2=CC(=NN2C3=CC=C(C=C3)S(=O)(=O)N)C(F)(F)F']
    names = ['aspirin', 'ibuprofen', 'celecoxib']
    self.y = [0, 1, 0]
    self.mols = []
    for s, n in zip(smiles, names):
        mol = Chem.MolFromSmiles(s)
        mol.SetProp('_Name', n)
        self.mols.append(mol)

    # write active and decoy files
    # NamedTemporaryFile(delete=False) replaces mkstemp + open so that no
    # OS-level descriptor is left open and the files are closed even if a
    # write fails.
    self.temp_dir = tempfile.mkdtemp()
    with tempfile.NamedTemporaryFile(
            mode='w', dir=self.temp_dir, suffix='.smi',
            delete=False) as active:
        with tempfile.NamedTemporaryFile(
                mode='w', dir=self.temp_dir, suffix='.smi',
                delete=False) as decoy:
            for this_smiles, name, y in zip(smiles, names, self.y):
                data = '{}\t{}\n'.format(this_smiles, name)
                if y:
                    active.write(data)
                else:
                    decoy.write(data)
    self.active_filename = active.name
    self.decoy_filename = decoy.name

    # reserve an (empty, closed) output file
    output = tempfile.NamedTemporaryFile(dir=self.temp_dir, delete=False)
    output.close()
    self.output_filename = output.name

    # get SMILES
    self.engine = SmilesGenerator()
    self.smiles = [self.engine.get_smiles(mol) for mol in self.mols]
def test_stereo_to_3d(self):
    """
    Test main with --stereo-from-3d.

    The actives file is rewritten with 3D coordinates. The output must
    contain no chirality markers with default arguments and at least one
    when --stereo-from-3d is passed.
    """
    # generate conformers for ibuprofen
    engine = conformers.ConformerGenerator()
    self.mols[1] = engine.generate_conformers(self.mols[1])
    assert self.mols[1].GetNumConformers() > 0

    # rewrite actives file with 3D coordinates
    # (reserve the name without leaking the descriptor mkstemp returns)
    handle = tempfile.NamedTemporaryFile(dir=self.temp_dir, suffix='.sdf',
                                         delete=False)
    handle.close()
    self.active_filename = handle.name
    with serial.MolWriter().open(self.active_filename) as writer:
        for mol, y in zip(self.mols, self.y):
            if y:
                # write the active currently iterated over rather than a
                # hard-coded index
                writer.write([mol])

    base_args = ['-a', self.active_filename,
                 '-d', self.decoy_filename,
                 '-o', self.output_filename]

    # check for absence of chirality using default arguments
    smiles, _ = self.check_output(base_args)
    assert not any('@' in this_smiles for this_smiles in smiles)

    # update reference SMILES
    self.engine = SmilesGenerator(assign_stereo_from_3d=True)
    self.smiles[1] = self.engine.get_smiles(self.mols[1])

    # check for presence of chirality using --stereo-from-3d
    smiles, _ = self.check_output(base_args + ['--stereo-from-3d'])
    assert any('@' in this_smiles for this_smiles in smiles)
def initialize(self):
    """
    Write the configuration file and create the SMILES engine.

    This lives outside __init__ because doing the work there breaks
    IPython.parallel.
    """
    handle, path = tempfile.mkstemp()
    self.config_filename = path
    # write through the descriptor mkstemp returned; leaving the ``with``
    # block closes it
    with os.fdopen(handle, "wb") as config_file:
        config_file.write(self.get_config())
    self.smiles_engine = SmilesGenerator(**self.smiles_engine_kwargs)
    self.initialized = True
def setUp(self):
    """
    Set up tests.

    Creates the three test molecules and their labels, writes the active
    and decoy SMILES files plus an empty output file into a new temporary
    directory, and computes the reference SMILES list ``self.smiles``.
    """
    smiles = [
        'CC(=O)OC1=CC=CC=C1C(=O)O',
        'CC(C)CC1=CC=C(C=C1)C(C)C(=O)O',
        'CC1=CC=C(C=C1)C2=CC(=NN2C3=CC=C(C=C3)S(=O)(=O)N)C(F)(F)F',
    ]
    names = ['aspirin', 'ibuprofen', 'celecoxib']
    self.y = [0, 1, 0]
    self.mols = []
    for s, n in zip(smiles, names):
        mol = Chem.MolFromSmiles(s)
        mol.SetProp('_Name', n)
        self.mols.append(mol)

    self.temp_dir = tempfile.mkdtemp()

    def new_file(**kwargs):
        """Create a named file in temp_dir that survives being closed."""
        return tempfile.NamedTemporaryFile(dir=self.temp_dir, delete=False,
                                           **kwargs)

    # write active and decoy files
    # mkstemp returns an open descriptor that the original code dropped;
    # the wrapper objects used here are closed by the ``with`` blocks.
    with new_file(mode='w', suffix='.smi') as active:
        with new_file(mode='w', suffix='.smi') as decoy:
            for this_smiles, name, y in zip(smiles, names, self.y):
                data = '{}\t{}\n'.format(this_smiles, name)
                if y:
                    active.write(data)
                else:
                    decoy.write(data)
    self.active_filename = active.name
    self.decoy_filename = decoy.name

    # reserve the output filename and close the file immediately
    with new_file() as output:
        self.output_filename = output.name

    # get SMILES
    self.engine = SmilesGenerator()
    self.smiles = [self.engine.get_smiles(mol) for mol in self.mols]
def test_stereo_to_3d(self):
    """
    Test main with --stereo-from-3d.

    After the actives file is rewritten with 3D coordinates, default
    arguments must yield SMILES without '@' and --stereo-from-3d must
    yield at least one SMILES containing '@'.
    """
    # generate conformers for ibuprofen
    engine = conformers.ConformerGenerator()
    self.mols[1] = engine.generate_conformers(self.mols[1])
    assert self.mols[1].GetNumConformers() > 0

    # rewrite actives file with 3D coordinates
    # (NamedTemporaryFile is closed right away, so no descriptor leaks)
    with tempfile.NamedTemporaryFile(dir=self.temp_dir, suffix='.sdf',
                                     delete=False) as placeholder:
        self.active_filename = placeholder.name
    with serial.MolWriter().open(self.active_filename) as writer:
        for mol, y in zip(self.mols, self.y):
            if y:
                # use the loop variable so every active is written
                writer.write([mol])

    common = ['-a', self.active_filename, '-d', self.decoy_filename,
              '-o', self.output_filename]

    # check for absence of chirality using default arguments
    smiles, targets = self.check_output(common)
    chiral = any('@' in this_smiles for this_smiles in smiles)
    assert not chiral

    # update reference SMILES
    self.engine = SmilesGenerator(assign_stereo_from_3d=True)
    self.smiles[1] = self.engine.get_smiles(self.mols[1])

    # check for presence of chirality using --stereo-from-3d
    smiles, targets = self.check_output(common + ['--stereo-from-3d'])
    chiral = any('@' in this_smiles for this_smiles in smiles)
    assert chiral
def main(featurizer_class, input_filename, output_filename,
         target_filename=None, featurizer_kwargs=None, parallel=False,
         client_kwargs=None, view_flags=None, compression_level=3,
         smiles_hydrogens=False, include_smiles=False, scaffolds=False,
         chiral_scaffolds=False, mol_id_prefix=None):
    """
    Featurize molecules in input_filename using the given featurizer.

    The molecules are read, optionally matched against targets, featurized,
    collected into a pandas DataFrame (columns 'mol_id', 'features' and,
    when requested, 'y', 'smiles' and 'scaffolds') and written with
    write_output_file.

    Parameters
    ----------
    featurizer_class : Featurizer
        Featurizer class.
    input_filename : str
        Filename containing molecules to be featurized.
    output_filename : str
        Output filename. Should end with .pkl or .pkl.gz. For names ending
        in .csv or .csv.gz, multidimensional features are stored as
        space-separated strings.
    target_filename : str, optional
        Pickle containing target values. Should either be array_like or a
        dict containing 'mol_id' and 'y' keys, corresponding to molecule
        IDs and target values.
    featurizer_kwargs : dict, optional
        Keyword arguments passed to featurizer.
    parallel : bool, optional
        Whether to featurize in parallel using IPython.parallel (default
        False). Passed through to the featurizer's featurize method.
    client_kwargs : dict, optional
        Keyword arguments for IPython.parallel Client.
    view_flags : dict, optional
        Flags for IPython.parallel LoadBalancedView.
    compression_level : int, optional (default 3)
        Compression level (0-9) to use with joblib.dump.
    smiles_hydrogens : bool, optional (default False)
        Whether to keep hydrogens when generating SMILES.
    include_smiles : bool, optional (default False)
        Include SMILES in output.
    scaffolds : bool, optional (default False)
        Whether to include scaffolds in output.
    chiral_scaffolds : bool, optional (default False)
        Whether to include chirality in scaffolds.
    mol_id_prefix : str, optional
        Prefix for molecule IDs.
    """
    mols, mol_ids = read_mols(input_filename, mol_id_prefix=mol_id_prefix)

    # get targets
    data = {}
    if target_filename is not None:
        targets = read_pickle(target_filename)
        if isinstance(targets, dict):
            # keep only the molecules and targets that collate_mols pairs up
            mol_indices, target_indices = collate_mols(
                mols, mol_ids, targets['y'], targets['mol_id'])
            mols = mols[mol_indices]
            mol_ids = mol_ids[mol_indices]
            targets = np.asarray(targets['y'])[target_indices]
        else:
            assert len(targets) == len(mols)
        data['y'] = targets

    # featurize molecules
    print("Featurizing molecules...")
    if featurizer_kwargs is None:
        featurizer_kwargs = {}
    featurizer = featurizer_class(**featurizer_kwargs)
    features = featurizer.featurize(mols, parallel, client_kwargs, view_flags)

    # fill in data container
    print("Saving results...")
    data['mol_id'] = mol_ids
    data['features'] = features

    # sanity checks
    assert data['features'].shape[0] == len(mols), (
        "Features do not match molecules.")
    assert data['mol_id'].shape[0] == len(mols), (
        "Molecule IDs do not match molecules.")

    # smiles, scaffolds, args
    if include_smiles:
        smiles = SmilesGenerator(remove_hydrogens=(not smiles_hydrogens))
        data['smiles'] = np.asarray([smiles.get_smiles(mol) for mol in mols])
    if scaffolds:
        data['scaffolds'] = get_scaffolds(mols, chiral_scaffolds)

    # construct a DataFrame
    # Each row of a multidimensional feature array must become one cell.
    # Deliberately best-effort: features lacking ``ndim`` or rows lacking
    # ``tolist`` are handed to pandas unchanged.
    try:
        if data['features'].ndim > 1:
            # numpy arrays will be "summarized" when written as strings
            # use str(row.tolist())[1:-1] to remove the surrounding brackets
            # remove commas (keeping spaces) to avoid conflicts with csv
            if output_filename.endswith(('.csv', '.csv.gz')):
                data['features'] = [
                    str(row.tolist())[1:-1].replace(', ', ' ')
                    for row in data['features']]
            else:
                data['features'] = list(data['features'])
    except AttributeError:
        pass
    df = pd.DataFrame(data)

    # write output file
    write_output_file(df, output_filename, compression_level)
class TestClassificationTargets(unittest.TestCase):
    """
    Tests for classification_targets.py.
    """
    def setUp(self):
        """
        Set up tests.

        Builds three named molecules with labels ``self.y``, writes the
        actives and decoys to separate SMILES files inside a fresh
        temporary directory, reserves an output filename there, and stores
        the reference SMILES in ``self.smiles``.
        """
        smiles = ['CC(=O)OC1=CC=CC=C1C(=O)O',
                  'CC(C)CC1=CC=C(C=C1)C(C)C(=O)O',
                  'CC1=CC=C(C=C1)C2=CC(=NN2C3=CC=C(C=C3)S(=O)(=O)N)C(F)(F)F']
        names = ['aspirin', 'ibuprofen', 'celecoxib']
        self.y = [0, 1, 0]
        self.mols = []
        for s, n in zip(smiles, names):
            mol = Chem.MolFromSmiles(s)
            mol.SetProp('_Name', n)
            self.mols.append(mol)

        # write active and decoy files
        # NamedTemporaryFile(delete=False) replaces mkstemp + open so that
        # no OS-level descriptor is left open and the files are closed
        # even if a write fails.
        self.temp_dir = tempfile.mkdtemp()
        with tempfile.NamedTemporaryFile(
                mode='w', dir=self.temp_dir, suffix='.smi',
                delete=False) as active:
            with tempfile.NamedTemporaryFile(
                    mode='w', dir=self.temp_dir, suffix='.smi',
                    delete=False) as decoy:
                for this_smiles, name, y in zip(smiles, names, self.y):
                    data = '{}\t{}\n'.format(this_smiles, name)
                    if y:
                        active.write(data)
                    else:
                        decoy.write(data)
        self.active_filename = active.name
        self.decoy_filename = decoy.name

        # reserve an (empty, closed) output file
        output = tempfile.NamedTemporaryFile(dir=self.temp_dir, delete=False)
        output.close()
        self.output_filename = output.name

        # get SMILES
        self.engine = SmilesGenerator()
        self.smiles = [self.engine.get_smiles(mol) for mol in self.mols]

    def tearDown(self):
        """
        Clean up tests by removing the temporary directory.
        """
        shutil.rmtree(self.temp_dir)

    def check_output(self, input_args):
        """
        Run main and check its output against the reference data.

        Parameters
        ----------
        input_args : list
            Command-line arguments.

        Returns
        -------
        smiles, targets
            The 'smiles' and 'targets' entries read back from the output
            file.
        """
        args = parse_args(input_args)
        main(args.actives, args.decoys, args.output, args.stereo_from_3d)
        data = read_pickle(self.output_filename)
        for smiles, target in zip(data['smiles'], data['targets']):
            assert smiles in self.smiles
            assert target == self.y[self.smiles.index(smiles)]
        return data['smiles'], data['targets']

    def test_defaults(self):
        """
        Test main with default parameters.
        """
        args = ['-a', self.active_filename, '-d', self.decoy_filename,
                '-o', self.output_filename]
        self.check_output(args)

    def test_stereo_to_3d(self):
        """
        Test main with --stereo-from-3d.

        The actives file is rewritten with 3D coordinates. The output must
        contain no chirality markers with default arguments and at least
        one when --stereo-from-3d is passed.
        """
        # generate conformers for ibuprofen
        engine = conformers.ConformerGenerator()
        self.mols[1] = engine.generate_conformers(self.mols[1])
        assert self.mols[1].GetNumConformers() > 0

        # rewrite actives file with 3D coordinates
        # (reserve the name without leaking the descriptor mkstemp returns)
        handle = tempfile.NamedTemporaryFile(
            dir=self.temp_dir, suffix='.sdf', delete=False)
        handle.close()
        self.active_filename = handle.name
        with serial.MolWriter().open(self.active_filename) as writer:
            for mol, y in zip(self.mols, self.y):
                if y:
                    # write the active currently iterated over rather than
                    # a hard-coded index
                    writer.write([mol])

        base_args = ['-a', self.active_filename,
                     '-d', self.decoy_filename,
                     '-o', self.output_filename]

        # check for absence of chirality using default arguments
        smiles, _ = self.check_output(base_args)
        assert not any('@' in this_smiles for this_smiles in smiles)

        # update reference SMILES
        self.engine = SmilesGenerator(assign_stereo_from_3d=True)
        self.smiles[1] = self.engine.get_smiles(self.mols[1])

        # check for presence of chirality using --stereo-from-3d
        smiles, _ = self.check_output(base_args + ['--stereo-from-3d'])
        assert any('@' in this_smiles for this_smiles in smiles)
def setUp(self):
    """
    Prepare fixtures: run the parent setUp, then create the engine.
    """
    # the parent fixtures come first
    super(TestSmilesGenerator, self).setUp()
    # default-configured generator stored on the test case
    default_engine = SmilesGenerator()
    self.engine = default_engine
class MoleculeDatabase(object):
    """
    Molecule database.

    Molecules are keyed by SMILES. The database is a set of SMILES strings
    that can be loaded from and saved to a text file with one SMILES per
    line; filenames ending in '.gz' are read and written with gzip.

    Parameters
    ----------
    kwargs : dict, optional
        Keyword arguments for SmilesGenerator.
    """
    def __init__(self, **kwargs):
        self.engine = SmilesGenerator(**kwargs)
        self.smiles = set()

    def __len__(self):
        """Return the number of distinct SMILES stored."""
        return len(self.smiles)

    def __iter__(self):
        """Iterate over the stored SMILES strings."""
        for smiles in self.smiles:
            yield smiles

    def __contains__(self, item):
        """Return True if the SMILES string ``item`` is stored."""
        return item in self.smiles

    def add_mol(self, mol):
        """
        Add a molecule to the database.

        Parameters
        ----------
        mol : RDKit Mol
            Molecule.
        """
        self.smiles.add(self.engine.get_smiles(mol))

    def load(self, filename):
        """
        Load an existing database.

        Every line is stripped, checked with Chem.MolFromSmiles and added
        to the database. Entries are added as they are read, so lines
        preceding an unreadable one stay in the database.

        Parameters
        ----------
        filename : str
            Existing database filename.

        Raises
        ------
        ValueError
            If a line cannot be parsed as SMILES.
        """
        opener = gzip.open if filename.endswith('.gz') else open
        # the context manager closes the file even when the sanity check
        # below raises
        with opener(filename) as f:
            for line in f:
                smiles = line.strip()
                mol = Chem.MolFromSmiles(smiles)

                # sanity check
                if mol is None:
                    raise ValueError(
                        'Database is unreadable: "{}".'.format(smiles))
                self.smiles.add(smiles)

    def save(self, filename):
        """
        Save the database to disk, one SMILES per line.

        Parameters
        ----------
        filename : str
            Filename.
        """
        opener = gzip.open if filename.endswith('.gz') else open
        # the context manager closes the file even if a write fails
        with opener(filename, 'wb') as f:
            for smiles in self.smiles:
                f.write('{}\n'.format(smiles))
class TestClassificationTargets(unittest.TestCase):
    """
    Tests for classification_targets.py.
    """
    def setUp(self):
        """
        Set up tests.

        Creates the three test molecules and their labels, writes the
        active and decoy SMILES files plus an empty output file into a new
        temporary directory, and computes the reference SMILES list
        ``self.smiles``.
        """
        smiles = [
            'CC(=O)OC1=CC=CC=C1C(=O)O',
            'CC(C)CC1=CC=C(C=C1)C(C)C(=O)O',
            'CC1=CC=C(C=C1)C2=CC(=NN2C3=CC=C(C=C3)S(=O)(=O)N)C(F)(F)F',
        ]
        names = ['aspirin', 'ibuprofen', 'celecoxib']
        self.y = [0, 1, 0]
        self.mols = []
        for s, n in zip(smiles, names):
            mol = Chem.MolFromSmiles(s)
            mol.SetProp('_Name', n)
            self.mols.append(mol)

        self.temp_dir = tempfile.mkdtemp()

        def new_file(**kwargs):
            """Create a named file in temp_dir that survives closing."""
            return tempfile.NamedTemporaryFile(
                dir=self.temp_dir, delete=False, **kwargs)

        # write active and decoy files
        # mkstemp returns an open descriptor that the original code
        # dropped; the wrapper objects used here are closed by ``with``.
        with new_file(mode='w', suffix='.smi') as active:
            with new_file(mode='w', suffix='.smi') as decoy:
                for this_smiles, name, y in zip(smiles, names, self.y):
                    data = '{}\t{}\n'.format(this_smiles, name)
                    if y:
                        active.write(data)
                    else:
                        decoy.write(data)
        self.active_filename = active.name
        self.decoy_filename = decoy.name

        # reserve the output filename and close the file immediately
        with new_file() as output:
            self.output_filename = output.name

        # get SMILES
        self.engine = SmilesGenerator()
        self.smiles = [self.engine.get_smiles(mol) for mol in self.mols]

    def tearDown(self):
        """
        Clean up tests by deleting the temporary directory.
        """
        shutil.rmtree(self.temp_dir)

    def check_output(self, input_args):
        """
        Check main output.

        Runs main with the parsed arguments, reads the output file back
        and asserts that every SMILES is a reference SMILES carrying the
        expected label.

        Parameters
        ----------
        input_args : list
            Command-line arguments.

        Returns
        -------
        smiles, targets
            The 'smiles' and 'targets' entries of the output.
        """
        args = parse_args(input_args)
        main(args.actives, args.decoys, args.output, args.stereo_from_3d)
        data = read_pickle(self.output_filename)
        for smiles, target in zip(data['smiles'], data['targets']):
            assert smiles in self.smiles
            assert target == self.y[self.smiles.index(smiles)]
        return data['smiles'], data['targets']

    def test_defaults(self):
        """
        Test main with default parameters.
        """
        args = [
            '-a', self.active_filename,
            '-d', self.decoy_filename,
            '-o', self.output_filename,
        ]
        self.check_output(args)

    def test_stereo_to_3d(self):
        """
        Test main with --stereo-from-3d.

        After the actives file is rewritten with 3D coordinates, default
        arguments must yield SMILES without '@' and --stereo-from-3d must
        yield at least one SMILES containing '@'.
        """
        # generate conformers for ibuprofen
        engine = conformers.ConformerGenerator()
        self.mols[1] = engine.generate_conformers(self.mols[1])
        assert self.mols[1].GetNumConformers() > 0

        # rewrite actives file with 3D coordinates
        # (NamedTemporaryFile is closed right away, so nothing leaks)
        with tempfile.NamedTemporaryFile(dir=self.temp_dir, suffix='.sdf',
                                         delete=False) as placeholder:
            self.active_filename = placeholder.name
        with serial.MolWriter().open(self.active_filename) as writer:
            for mol, y in zip(self.mols, self.y):
                if y:
                    # use the loop variable so every active is written
                    writer.write([mol])

        common = ['-a', self.active_filename, '-d', self.decoy_filename,
                  '-o', self.output_filename]

        # check for absence of chirality using default arguments
        smiles, targets = self.check_output(common)
        chiral = any('@' in this_smiles for this_smiles in smiles)
        assert not chiral

        # update reference SMILES
        self.engine = SmilesGenerator(assign_stereo_from_3d=True)
        self.smiles[1] = self.engine.get_smiles(self.mols[1])

        # check for presence of chirality using --stereo-from-3d
        smiles, targets = self.check_output(common + ['--stereo-from-3d'])
        chiral = any('@' in this_smiles for this_smiles in smiles)
        assert chiral
def __init__(self, **kwargs):
    # every keyword argument is forwarded unchanged to SmilesGenerator
    engine = SmilesGenerator(**kwargs)
    self.engine = engine
    # the database starts out empty
    self.smiles = set()
class Dragon(object):
    """
    Wrapper for dragon6shell.

    Parameters
    ----------
    subset : str, optional (default '2d')
        Descriptor subset.
    kwargs : dict, optional
        Keyword arguments for SmilesGenerator.
    """
    def __init__(self, subset="2d", **kwargs):
        self.subset = subset
        self.initialized = False
        self.config_filename, self.smiles_engine = None, None
        self.smiles_engine_kwargs = kwargs

    def initialize(self):
        """
        Initialize.

        Writes the configuration to a temporary file and creates the
        SMILES engine. This is not part of __init__ because it breaks
        IPython.parallel.
        """
        fd, self.config_filename = tempfile.mkstemp()
        # write through the descriptor mkstemp returned; leaving the
        # ``with`` block closes it
        with os.fdopen(fd, "wb") as f:
            f.write(self.get_config())
        self.smiles_engine = SmilesGenerator(**self.smiles_engine_kwargs)
        self.initialized = True

    def __del__(self):
        """
        Cleanup: remove the temporary configuration file, if any.
        """
        # getattr guards against instances whose __init__ never ran
        filename = getattr(self, 'config_filename', None)
        if filename is not None:
            try:
                os.unlink(filename)
            except OSError:
                # already removed; an exception raised inside __del__
                # cannot be handled by any caller
                pass
            self.config_filename = None

    def get_config(self):
        """
        Get configuration file.

        Returns
        -------
        config : str
            Contents of the dragon6shell configuration for the selected
            descriptor subset.

        Raises
        ------
        NotImplementedError
            If ``self.subset`` is anything other than '2d'.
        """
        if self.subset == "2d":
            return """<?xml version="1.0" encoding="utf-8"?>
<DRAGON version="6.0.36" script_version="1" generation_date="2014/11/17">
  <OPTIONS>
    <CheckUpdates value="true"/>
    <SaveLayout value="true"/>
    <ShowWorksheet value="false"/>
    <Decimal_Separator value="."/>
    <Missing_String value="NaN"/>
    <DefaultMolFormat value="1"/>
    <HelpBrowser value="/usr/bin/xdg-open"/>
    <RejectUnusualValence value="false"/>
    <Add2DHydrogens value="false"/>
    <MaxSRforAllCircuit value="19"/>
    <MaxSR value="35"/>
    <MaxSRDetour value="30"/>
    <MaxAtomWalkPath value="2000"/>
    <LogPathWalk value="true"/>
    <LogEdge value="true"/>
    <Weights>
      <weight name="Mass"/>
      <weight name="VdWVolume"/>
      <weight name="Electronegativity"/>
      <weight name="Polarizability"/>
      <weight name="Ionization"/>
      <weight name="I-State"/>
    </Weights>
    <SaveOnlyData value="false"/>
    <SaveLabelsOnSeparateFile value="false"/>
    <SaveFormatBlock value="%b - %n.txt"/>
    <SaveFormatSubBlock value="%b-%s - %n - %m.txt"/>
    <SaveExcludeMisVal value="false"/>
    <SaveExcludeAllMisVal value="false"/>
    <SaveExcludeConst value="false"/>
    <SaveExcludeNearConst value="false"/>
    <SaveExcludeStdDev value="false"/>
    <SaveStdDevThreshold value="0.0001"/>
    <SaveExcludeCorrelated value="false"/>
    <SaveCorrThreshold value="0.95"/>
    <SaveExclusionOptionsToVariables value="false"/>
    <SaveExcludeMisMolecules value="false"/>
    <SaveExcludeRejectedMolecules value="false"/>
  </OPTIONS>
  <DESCRIPTORS>
    <block id="1" SelectAll="true"/>
    <block id="2" SelectAll="true"/>
    <block id="3" SelectAll="true"/>
    <block id="4" SelectAll="true"/>
    <block id="5" SelectAll="true"/>
    <block id="6" SelectAll="true"/>
    <block id="7" SelectAll="true"/>
    <block id="8" SelectAll="true"/>
    <block id="9" SelectAll="true"/>
    <block id="10" SelectAll="true"/>
    <block id="11" SelectAll="true"/>
    <block id="12" SelectAll="true"/>
    <block id="21" SelectAll="true"/>
    <block id="22" SelectAll="true"/>
    <block id="23" SelectAll="true"/>
    <block id="24" SelectAll="true"/>
    <block id="25" SelectAll="true"/>
    <block id="28" SelectAll="true"/>
    <block id="29" SelectAll="true"/>
  </DESCRIPTORS>
  <MOLFILES>
    <molInput value="stdin"/>
    <molInputFormat value="SMILES"/>
  </MOLFILES>
  <OUTPUT>
    <SaveStdOut value="true"/>
    <SaveProject value="false"/>
    <SaveFile value="false"/>
    <logMode value="stderr"/>
  </OUTPUT>
</DRAGON>
"""
        else:
            raise NotImplementedError

    def get_descriptors(self, mols):
        """
        Calculate descriptors by piping SMILES through dragon6shell.

        Parameters
        ----------
        mols : array_like
            Molecules.

        Returns
        -------
        features : ndarray
            Object array with one entry per molecule: the descriptor row,
            or None for a molecule whose SMILES is absent from the output.

        Raises
        ------
        RuntimeError
            If dragon6shell writes nothing to stdout; the message is its
            stderr output.
        """
        if not self.initialized:
            self.initialize()
        smiles = [self.smiles_engine.get_smiles(mol) for mol in mols]
        args = ["dragon6shell", "-s", self.config_filename]
        p = subprocess.Popen(args, stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = p.communicate("\n".join(smiles))
        if not stdout:
            raise RuntimeError(stderr)
        data, names = self.parse_descriptors(stdout)

        # adjust for skipped molecules
        # descriptors are in same order as smiles
        # a set makes the per-molecule lookup constant time
        calculated = set(names)
        features = np.zeros(len(smiles), dtype=object)
        idx = 0  # index into calculated features
        for i, this_smiles in enumerate(smiles):
            if this_smiles not in calculated:
                features[i] = None
            else:
                assert this_smiles == names[idx]  # confirm match
                features[i] = data[idx]
                idx += 1
        assert len(features) == len(mols)
        return features

    def parse_descriptors(self, string):
        """
        Parse Dragon descriptors.

        Parameters
        ----------
        string : str
            Output from dragon6shell.

        Returns
        -------
        data : ndarray
            Float array of descriptor values, one row per molecule.
        names : ndarray
            Values of the NAME column.
        """
        df = pd.read_table(StringIO(string))
        if self.subset == "2d":
            del df["nHBonds"], df["Psi_e_1d"], df["Psi_e_1s"]

        # extract names
        names = df["NAME"].values

        # delete No. and NAME columns
        del df["No."], df["NAME"]
        return np.asarray(df, dtype=float), names
def main(featurizer_class, input_filename, output_filename,
         target_filename=None, featurizer_kwargs=None, parallel=False,
         client_kwargs=None, view_flags=None, compression_level=3,
         smiles_hydrogens=False, include_smiles=False, scaffolds=False,
         chiral_scaffolds=False, mol_id_prefix=None):
    """
    Featurize molecules in input_filename using the given featurizer.

    Reads the molecules, optionally aligns them with targets, featurizes
    them, assembles a pandas DataFrame ('mol_id' and 'features' columns
    plus 'y', 'smiles' and 'scaffolds' when requested) and hands it to
    write_output_file.

    Parameters
    ----------
    featurizer_class : Featurizer
        Featurizer class.
    input_filename : str
        Filename containing molecules to be featurized.
    output_filename : str
        Output filename. Should end with .pkl or .pkl.gz. When it ends
        with .csv or .csv.gz, multidimensional features are written as
        space-separated strings.
    target_filename : str, optional
        Pickle containing target values. Should either be array_like or a
        dict containing 'mol_id' and 'y' keys, corresponding to molecule
        IDs and target values.
    featurizer_kwargs : dict, optional
        Keyword arguments passed to featurizer.
    parallel : bool, optional
        Whether to featurize in parallel using IPython.parallel (default
        False). Forwarded to the featurizer's featurize method.
    client_kwargs : dict, optional
        Keyword arguments for IPython.parallel Client.
    view_flags : dict, optional
        Flags for IPython.parallel LoadBalancedView.
    compression_level : int, optional (default 3)
        Compression level (0-9) to use with joblib.dump.
    smiles_hydrogens : bool, optional (default False)
        Whether to keep hydrogens when generating SMILES.
    include_smiles : bool, optional (default False)
        Include SMILES in output.
    scaffolds : bool, optional (default False)
        Whether to include scaffolds in output.
    chiral_scaffolds : bool, optional (default False)
        Whether to include chirality in scaffolds.
    mol_id_prefix : str, optional
        Prefix for molecule IDs.
    """
    mols, mol_ids = read_mols(input_filename, mol_id_prefix=mol_id_prefix)

    # get targets
    data = {}
    if target_filename is not None:
        targets = read_pickle(target_filename)
        if isinstance(targets, dict):
            # restrict molecules and targets to the pairs collate_mols finds
            mol_indices, target_indices = collate_mols(
                mols, mol_ids, targets['y'], targets['mol_id'])
            mols = mols[mol_indices]
            mol_ids = mol_ids[mol_indices]
            targets = np.asarray(targets['y'])[target_indices]
        else:
            assert len(targets) == len(mols)
        data['y'] = targets

    # featurize molecules
    print("Featurizing molecules...")
    if featurizer_kwargs is None:
        featurizer_kwargs = {}
    featurizer = featurizer_class(**featurizer_kwargs)
    features = featurizer.featurize(mols, parallel, client_kwargs, view_flags)

    # fill in data container
    print("Saving results...")
    data['mol_id'] = mol_ids
    data['features'] = features

    # sanity checks
    assert data['features'].shape[0] == len(mols), (
        "Features do not match molecules.")
    assert data['mol_id'].shape[0] == len(mols), (
        "Molecule IDs do not match molecules.")

    # smiles, scaffolds, args
    if include_smiles:
        smiles = SmilesGenerator(remove_hydrogens=(not smiles_hydrogens))
        data['smiles'] = np.asarray([smiles.get_smiles(mol) for mol in mols])
    if scaffolds:
        data['scaffolds'] = get_scaffolds(mols, chiral_scaffolds)

    # construct a DataFrame
    # A multidimensional feature array is split into one entry per row.
    # Deliberately best-effort: if the features lack ``ndim`` or the rows
    # lack ``tolist``, they are passed to pandas untouched.
    to_csv = output_filename.endswith(('.csv', '.csv.gz'))
    try:
        if data['features'].ndim > 1:
            if to_csv:
                # numpy arrays will be "summarized" when written as strings
                # use str(row.tolist())[1:-1] to remove the surrounding
                # brackets; remove commas (keeping spaces) to avoid
                # conflicts with csv
                data['features'] = [
                    str(row.tolist())[1:-1].replace(', ', ' ')
                    for row in data['features']
                ]
            else:
                data['features'] = list(data['features'])
    except AttributeError:
        pass
    df = pd.DataFrame(data)

    # write output file
    write_output_file(df, output_filename, compression_level)
def main(featurizer_class, input_filename, output_filename,
         target_filename=None, featurizer_kwargs=None, parallel=False,
         client_kwargs=None, view_flags=None, compression_level=3,
         smiles_hydrogens=False, names=False, scaffolds=False,
         chiral_scaffolds=False):
    """
    Featurize molecules in input_filename using the given featurizer.

    Reads the molecules, optionally aligns them with targets, featurizes
    them and writes a dict with 'features', 'smiles' and 'args' entries
    (plus 'y', 'names' and 'scaffolds' when requested) using
    write_output_file.

    Parameters
    ----------
    featurizer_class : Featurizer
        Featurizer class.
    input_filename : str
        Filename containing molecules to be featurized.
    output_filename : str
        Output filename. Should end with .pkl or .pkl.gz.
    target_filename : str, optional
        Pickle containing target values. Should either be array_like or a
        dict containing 'names' and 'y' keys, corresponding to molecule
        names and target values.
    featurizer_kwargs : dict, optional
        Keyword arguments passed to featurizer.
    parallel : bool, optional
        Whether to featurize in parallel using IPython.parallel (default
        False). Forwarded to the featurizer's featurize method.
    client_kwargs : dict, optional
        Keyword arguments for IPython.parallel Client.
    view_flags : dict, optional
        Flags for IPython.parallel LoadBalancedView.
    compression_level : int, optional (default 3)
        Compression level (0-9) to use with joblib.dump.
    smiles_hydrogens : bool, optional (default False)
        Whether to keep hydrogens when generating SMILES.
    names : bool, optional (default False)
        Whether to include molecule names in output.
    scaffolds : bool, optional (default False)
        Whether to include scaffolds in output.
    chiral_scaffolds : bool, optional (default False)
        Whether to include chirality in scaffolds.
    """
    mols, mol_names = read_mols(input_filename)

    # get targets
    data = {}
    if target_filename is not None:
        targets = read_pickle(target_filename)
        if isinstance(targets, dict):
            # restrict molecules and targets to the pairs collate_mols finds
            mol_indices, target_indices = collate_mols(
                mols, mol_names, targets['y'], targets['names'])
            mols = mols[mol_indices]
            mol_names = mol_names[mol_indices]
            targets = np.asarray(targets['y'])[target_indices]
        else:
            assert len(targets) == len(mols)
        data['y'] = targets

    # featurize molecules
    print("Featurizing molecules...")
    if featurizer_kwargs is None:
        featurizer_kwargs = {}
    featurizer = featurizer_class(**featurizer_kwargs)
    features = featurizer.featurize(mols, parallel, client_kwargs, view_flags)

    # fill in data container
    print("Saving results...")
    data['features'] = features

    # calculate SMILES
    smiles = SmilesGenerator(remove_hydrogens=(not smiles_hydrogens))
    data['smiles'] = np.asarray([smiles.get_smiles(mol) for mol in mols])

    # sanity checks
    assert data['features'].shape[0] == len(mols), (
        "Features do not match molecules.")
    assert data['smiles'].shape[0] == len(mols), (
        "SMILES do not match molecules.")

    # names, scaffolds, args
    if names:
        data['names'] = mol_names
    if scaffolds:
        data['scaffolds'] = get_scaffolds(mols, chiral_scaffolds)
    # record the arguments alongside the results
    data['args'] = {'featurizer_class': featurizer_class.__name__,
                    'input_filename': input_filename,
                    'target_filename': target_filename,
                    'featurizer_kwargs': featurizer_kwargs,
                    'chiral_scaffolds': chiral_scaffolds}

    # write output file
    write_output_file(data, output_filename, compression_level)