def test_random_proj():
    """Run the GPU random-projection clustering workflow on a ChEMBL test set."""
    _create_context()
    mol_count, data_dao, embedding_df = _fetch_chembl_test_dataset()

    workflow = GpuWorkflowRandomProjection(n_molecules=mol_count, dao=data_dao)
    workflow.cluster(df_mol_embedding=embedding_df)
def test_gpukmeansumap_dask():
    """Run the GPU KMeans-UMAP clustering workflow on a ChEMBL test set."""
    _create_context()
    mol_count, data_dao, embedding_df = _fetch_chembl_test_dataset()

    workflow = GpuKmeansUmap(n_molecules=mol_count,
                             dao=data_dao,
                             pca_comps=64)
    workflow.cluster(df_mol_embedding=embedding_df)
def test_run_benchmark(benchmark_config_list):
    """Run every benchmark configuration end-to-end and validate the reports.

    For each config dict (keys: ``test_type``, ``use_gpu``, ``n_workers``,
    ``n_mol``) a fresh context is created, the workflow class named by the
    dotted path in ``test_type`` is clustered and its QA metric computed,
    appending timings to a shared CSV logfile. The CSV is then moved to a
    randomized name, parsed, and converted to xlsx/markdown/png reports
    whose existence is asserted.
    """
    # BUG FIX: tempfile.tempdir is None until the tempfile module lazily
    # initializes it, so os.path.join(None, ...) would raise TypeError.
    # gettempdir() always returns a usable directory path.
    output_dir = tempfile.gettempdir()
    output_file = os.path.join(output_dir, 'benchmark.csv')
    initialize_logfile(output_file)

    max_n_mol = 0
    for config in benchmark_config_list:
        test_type = config['test_type']
        use_gpu = config['use_gpu']
        n_workers = config['n_workers']
        n_mol = config['n_mol']
        max_n_mol = max(max_n_mol, n_mol)

        context = _create_context(use_gpu=use_gpu,
                                  n_workers=n_workers,
                                  benchmark_file=output_file)
        if not use_gpu:
            context.compute_type = 'cpu'
        context.n_molecule = n_mol
        context.cache_directory = None
        context.is_benchmark = True

        # Resolve the workflow class from its dotted-path name and run it.
        wf_class = locate(test_type)
        workflow = wf_class()
        workflow.cluster()
        workflow.compute_qa_matric()

        # Tear down the dask cluster so the next config starts clean.
        context.dask_client.cluster.close()
        context.dask_client.close()
        context.dask_client = None

    # Filename is set in workflow -- move to a randomized name so concurrent
    # runs do not clobber each other's results.
    temp_file = tempfile.NamedTemporaryFile(prefix='benchmark_',
                                            suffix='.csv',
                                            dir=output_dir,
                                            delete=False).name
    shutil.move(output_file, temp_file)
    assert os.path.exists(temp_file)

    benchmark_results = pd.read_csv(temp_file, comment='#')
    logger.info(benchmark_results)

    nrows, ncols = benchmark_results.shape
    assert ncols == 8
    assert nrows >= len(benchmark_config_list)
    # assert benchmark_results['n_molecules'].min() > 0
    # assert benchmark_results['n_molecules'].min() < max_n_mol

    df, machine_config = prepare_benchmark_df(temp_file)
    basename = os.path.splitext(temp_file)[0]
    excel_file = basename + '.xlsx'
    assert os.path.exists(excel_file)
    md_file = basename + '.md'
    assert os.path.exists(md_file)

    png_file = basename + '.png'
    prepare_acceleration_stacked_plot(df, machine_config,
                                      output_path=png_file)
    assert os.path.exists(png_file)
def test_add_molecule_GpuKmeansUmap():
    """Cluster most of a ChEMBL test set, then add held-out molecules back in."""
    _create_context()
    n_molecules, dao, mol_df = _fetch_chembl_test_dataset()

    # Materialize a dask frame before converting to cudf.
    if hasattr(mol_df, 'compute'):
        mol_df = mol_df.compute()
    mol_df = cudf.from_pandas(mol_df)
    n_molecules = mol_df.shape[0]

    # The held-out set should contain both already-clustered and new molecules:
    # the last 20 rows overlap the clustered prefix (all but the last 10).
    held_out = mol_df[n_molecules - 20:]
    molregnos_to_add = held_out['id'].to_array().tolist()

    chembl_data = ChEmblData()
    logger.info('Fetching ChEMBLLE id for %s', molregnos_to_add)
    chembl_ids = [
        str(record[0])
        for record in chembl_data.fetch_chemblId_by_molregno(molregnos_to_add)
    ]
    logger.info('ChEMBL ids to be added %s', chembl_ids)

    # Molecules to be used for clustering
    cluster_input = mol_df[:n_molecules - 10]

    workflow = GpuKmeansUmap(n_molecules=n_molecules, dao=dao, pca_comps=64)
    workflow.cluster(df_mol_embedding=cluster_input)

    missing_mols, molregnos, df_embedding = workflow.add_molecules(chembl_ids)
    n_missing = len(missing_mols)
    assert n_missing == 10, 'Expected 10 missing molecules found %d' % n_missing

    # TODO: Once the issue with add_molecule in multi-gpu env. is fixed, the
    # number of missing_molregno found should be 0
    missing_mols, molregnos, df_embedding = workflow.add_molecules(chembl_ids)
    n_missing = len(missing_mols)
    assert n_missing == 0, 'Expected no missing molecules found %d' % n_missing
def generate():
    """Generate up to 21 similar molecules per approved drug via MolBART.

    Reads a CSV of approved drugs and, for each row whose canonical SMILES
    passes the Lipinski (>= 3) and QED (<= MAX_QED) filter, asks MolBART for
    similar SMILES, keeping those that also pass the filter. Each row expands
    to a fixed-width record of 21 SMILES + 21 Lipinski scores + 21 QED scores
    (padded with ''/0), written to /workspace/similar_mols.csv.
    """
    wf = MolBART()
    num_to_add = 21

    def _generate(data):
        # Row-wise worker: returns num_to_add SMILES + scores, padded to a
        # fixed width so result_type='expand' always yields the same columns.
        smiles = data['canonical_smiles']
        lipinski_score, qed = score_molecule(smiles)
        num_to_generate = 40

        lipinski_scores = []
        qed_scores = []
        valid_list = []
        try:
            if lipinski_score >= 3 and qed <= LipinskiRuleOfFiveDecorator.MAX_QED:
                generated_list = wf.find_similars_smiles_list(
                    smiles, num_requested=num_to_generate, radius=0.0001)

                for new_smiles in generated_list:
                    lipinski_score, qed = score_molecule(new_smiles)
                    if lipinski_score >= 3 and qed <= LipinskiRuleOfFiveDecorator.MAX_QED:
                        valid_list.append(new_smiles)
                        lipinski_scores.append(lipinski_score)
                        qed_scores.append(qed)
                        if len(valid_list) >= num_to_add:
                            break
        except Exception as ex:
            # Best-effort generation: a failure leaves this row padded below,
            # but log it instead of silently swallowing the error.
            logger.warning('Molecule generation failed for %s: %s', smiles, ex)

        valid_list += [''] * (num_to_add - len(valid_list))
        lipinski_scores += [0] * (num_to_add - len(lipinski_scores))
        qed_scores += [0] * (num_to_add - len(qed_scores))

        return valid_list + lipinski_scores + qed_scores

    data = pandas.read_csv(
        '/workspace/tests/data/benchmark_approved_drugs.csv')

    # Dask meta frame describing the expanded output: num_to_add object
    # columns (SMILES), num_to_add int8 columns (Lipinski scores), and
    # num_to_add float64 columns (QED scores).
    prop_meta = dict(
        zip([i for i in range(num_to_add)],
            [pandas.Series([], dtype='object') for i in range(num_to_add)]))
    prop_meta.update(
        dict(
            zip([num_to_add + i for i in range(num_to_add)],
                [pandas.Series([], dtype='int8') for i in range(num_to_add)])))
    prop_meta.update(
        dict(
            zip([(2 * num_to_add) + i for i in range(num_to_add)],
                [pandas.Series([], dtype='float64')
                 for i in range(num_to_add)])))
    meta_df = pandas.DataFrame(prop_meta)

    _create_context()
    ddf = dd.from_pandas(data, npartitions=4 * multiprocessing.cpu_count())
    ddf = ddf.map_partitions(
        lambda dframe: dframe.apply(_generate, result_type='expand', axis=1),
        meta=meta_df)
    ddf = ddf.compute(scheduler='processes')
    ddf.to_csv("/workspace/similar_mols.csv")