def prepare_dataset(sdffile, dest=None, rename=True, conformations=False, overwrite=False):
    """
    Bootstraps the analysis of a dataset stored in an SDF file.

    Unless the prepared SDF already exists (and overwrite is False), this:
      - Reads all the molecules in sdffile
      - Creates the destination directory if it does not exist
      - Optionally renames the compounds to "<name>-<index>"-like titles
        (delegated to rename_mols_by_index), where name is the file name
        of sdffile without extension
      - Optionally generates 3D conformations (best effort, failures are
        reported and the molecule is kept as it was)
      - Saves the molecules to "<name>-prepared.sdf"
      - Saves the "master" and "saliviewer" tables

    Parameters:
      - sdffile: path to the input SDF file
      - dest: directory where the results are written; if it is not given
              the directory of sdffile is used
      - rename: whether to rename the compounds to track their provenance
      - conformations: whether to generate 3D conformations
      - overwrite: whether to redo the work if the prepared SDF is already there

    Returns the tuple (path to the prepared SDF, path to the master table),
    also when the preparation was skipped because the SDF already existed.
    """
    root, name = op.split(sdffile)
    name = op.splitext(name)[0]
    if not dest:
        dest = root

    dest_sdf = op.join(dest, name + '-prepared.sdf')
    master_table = op.join(dest, name + '-prepared-master.csv')
    sali_table = op.join(dest, name + '-prepared-saliviewer.csv')

    # NOTE: print is always called with a single parenthesized argument, which
    # produces the same output with the print statement and the print function.
    if op.exists(dest_sdf) and not overwrite:
        print('%s is already there and not overwriting requested' % dest_sdf)
        return dest_sdf, master_table

    print('Reading %s' % sdffile)
    mols = list(pybel.readfile('sdf', sdffile))

    print('\tCreating dataset root: %s' % dest)
    # dest is '' when sdffile is a bare file name and no dest was given;
    # that means "current directory" and os.makedirs('') would raise.
    if dest and not op.exists(dest):
        os.makedirs(dest)

    if rename:
        print('\tRenaming the compounds to keep track of the provenance')
        rename_mols_by_index(mols, name + '-')

    if conformations:
        print('\tGenerating conformations')
        # Molecules whose title contains any of these tags are left without
        # a 3D conformation (presumably they are problematic for make3D;
        # the reason is not recorded here).
        skipped_tags = ('train-3988', 'train-4205', 'dsstox-4205', 'dsstox-4206')
        for mol in mols:
            if any(tag in mol.title for tag in skipped_tags):
                continue
            try:
                print('Conformation for %s' % mol.title)
                mol.make3D()
            except Exception:
                # Deliberate best effort: report and carry on with the next molecule
                print('Error computing a 3D conformation for %s' % mol.title)

    print('\tSaving compounds')
    save_mols(mols, dest_sdf)

    print('\tCreating \"master\" table: %s' % master_table)
    create_master_table(dest_sdf, master_table, fields=['Activity'])

    print('\tCreating \"saliviewer\" table: %s' % sali_table)
    create_saliviewer_input(master_table, sali_table)

    return dest_sdf, master_table
mols_dsstox = list(pybel.readfile('sdf', dsstox_original)) print 'Num molecules ames=%d, bursi=%d, dsstox=%d' % (len(mols_ames), len(mols_bursi), len(mols_dsstox)) #The activity is always stored in the same field for mol in mols_bursi: activity = '1' if mol.data['Ames test categorisation'] == 'mutagen' else '0' mol.data['Activity'] = activity mol.data['CAS_NO'] = mol.title for mol in mols_dsstox: mol.data['Activity'] = mol.data['Tox'] mol.data['CAS_NO'] = mol.data['CAS'] print '\tRenaming the compounds to keep track of the provenance' rename_mols_by_index(mols_ames, 'ames-') rename_mols_by_index(mols_bursi, 'bursi-') rename_mols_by_index(mols_dsstox, 'dsstox-') print '\tComputing and analyzing the union of the datasets' cas_dupes = duplicates_by_field(mols_ames + mols_bursi + mols_dsstox) #inchi_dupes = duplicates_by_format(mols_ames + mols_bursi + mols_dsstox,) can_dupes = duplicates_by_format(mols_ames + mols_bursi + mols_dsstox, 'can') #Quick and dirty retrieval of compounds for unit-tests #Canonical smiles that are different due to # - missing hydrogens (report) # - bad perception of stereochemistry # - charges # - ... can_dupes2 = {}