Example #1
def test_random_proj():
    """
    Verify fetching data from chemblDB when the input is a pandas df.
    """
    _create_context()

    n_molecules, dao, mol_df = _fetch_chembl_test_dataset()

    wf = GpuWorkflowRandomProjection(n_molecules=n_molecules, dao=dao)
    wf.cluster(df_mol_embedding=mol_df)
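
This example and the ones that follow rely on two helpers that are not shown: _create_context and _fetch_chembl_test_dataset. The sketch below is a minimal, self-contained approximation of what they might do; every name, signature, and path in it is an assumption, not the actual cuchem test utility.

# Hypothetical sketches of the shared test helpers used in these
# examples; names, signatures, and paths are illustrative assumptions.
from types import SimpleNamespace

import dask.dataframe as dd
import pandas as pd
from dask.distributed import Client, LocalCluster


def _create_context(use_gpu=True, n_workers=2, benchmark_file=None):
    # Start a local Dask cluster and record the run settings on a
    # context object that the workflows consult.
    client = Client(LocalCluster(n_workers=n_workers))
    return SimpleNamespace(dask_client=client,
                           compute_type='gpu' if use_gpu else 'cpu',
                           benchmark_file=benchmark_file)


def _fetch_chembl_test_dataset(n_molecules=1000):
    # Load a small sample of molecule embeddings. The real helper reads
    # from a ChEMBL-backed DAO; this stand-in reads a CSV (assumed path)
    # and returns (count, dao, dask dataframe) in the same shape.
    mol_df = pd.read_csv('/workspace/tests/data/chembl_sample.csv')
    dao = None  # stands in for the ChEMBL cluster-workflow DAO
    return n_molecules, dao, dd.from_pandas(mol_df, npartitions=2)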
Example #2
def test_gpukmeansumap_dask():
    """
    Verify fetching data from chemblDB when the input is a pandas df.
    """
    _create_context()

    n_molecules, dao, mol_df = _fetch_chembl_test_dataset()

    wf = GpuKmeansUmap(n_molecules=n_molecules, dao=dao, pca_comps=64)
    wf.cluster(df_mol_embedding=mol_df)
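
The Dask variant of GpuKmeansUmap consumes a GPU-resident distributed dataframe. If the embeddings start out in pandas on the host, a conversion along these lines moves them onto GPU workers first; the column names and partition count below are arbitrary illustrations.

import cudf
import dask_cudf
import pandas as pd

# Sketch: promote a host-side pandas frame to a distributed GPU frame
# before calling wf.cluster(df_mol_embedding=...).
mol_df_pandas = pd.DataFrame({'fp0': [0.1, 0.4], 'fp1': [0.2, 0.3]})
gdf = cudf.from_pandas(mol_df_pandas)               # host -> single GPU
dask_gdf = dask_cudf.from_cudf(gdf, npartitions=2)  # shard across workers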
Example #3
def test_run_benchmark(benchmark_config_list):
    output_dir = tempfile.gettempdir()
    output_file = os.path.join(output_dir, 'benchmark.csv')
    initialize_logfile(output_file)

    max_n_mol = 0
    for config in benchmark_config_list:
        test_type = config['test_type']
        use_gpu = config['use_gpu']
        n_workers = config['n_workers']
        n_mol = config['n_mol']
        max_n_mol = max(max_n_mol, n_mol)

        context = _create_context(use_gpu=use_gpu,
                                  n_workers=n_workers,
                                  benchmark_file=output_file)

        if not use_gpu:
            context.compute_type = 'cpu'
        context.n_molecule = n_mol
        context.cache_directory = None
        context.is_benchmark = True

        wf_class = locate(test_type)
        workflow = wf_class()

        workflow.cluster()
        workflow.compute_qa_matric()

        context.dask_client.cluster.close()
        context.dask_client.close()
        context.dask_client = None

    # The output filename is fixed by the workflow; move it to a randomized name.
    temp_file = tempfile.NamedTemporaryFile(prefix='benchmark_',
                                            suffix='.csv',
                                            dir=output_dir,
                                            delete=False).name
    shutil.move(output_file, temp_file)
    assert os.path.exists(temp_file)

    benchmark_results = pd.read_csv(temp_file, comment='#')
    logger.info(benchmark_results)

    nrows, ncols = benchmark_results.shape
    assert ncols == 8
    assert nrows >= len(benchmark_config_list)
    # assert benchmark_results['n_molecules'].min() > 0
    # assert benchmark_results['n_molecules'].min() < max_n_mol

    df, machine_config = prepare_benchmark_df(temp_file)
    basename = os.path.splitext(temp_file)[0]
    excel_file = basename + '.xlsx'
    assert os.path.exists(excel_file)
    md_file = basename + '.md'
    assert os.path.exists(md_file)

    png_file = basename + '.png'
    prepare_acceleration_stacked_plot(df, machine_config, output_path=png_file)
    assert os.path.exists(png_file)
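
test_run_benchmark receives benchmark_config_list from a pytest fixture that is not shown. A sketch of such a fixture follows; the keys mirror those read in the loop above, while the dotted workflow paths (resolved with pydoc's locate in the test body) are illustrative assumptions.

import pytest


@pytest.fixture
def benchmark_config_list():
    # One dict per benchmark run: workflow class path, GPU flag, Dask
    # worker count, and molecule count. The paths below are illustrative.
    return [
        {'test_type': 'nvidia.cheminformatics.wf.cluster.gpukmeansumap.GpuKmeansUmap',
         'use_gpu': True,
         'n_workers': 1,
         'n_mol': 5000},
        {'test_type': 'nvidia.cheminformatics.wf.cluster.cpukmeansumap.CpuKmeansUmap',
         'use_gpu': False,
         'n_workers': 8,
         'n_mol': 5000},
    ]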
Example #4
def test_add_molecule_GpuKmeansUmap():
    """
    Verify fetching data from chemblDB when the input is a cudf df.
    """
    _create_context()

    n_molecules, dao, mol_df = _fetch_chembl_test_dataset()

    if hasattr(mol_df, 'compute'):
        mol_df = mol_df.compute()

    mol_df = cudf.from_pandas(mol_df)
    n_molecules = mol_df.shape[0]

    # The test set should contain both available and new molecules.
    test_mol = mol_df[n_molecules - 20:]
    mols_tobe_added = test_mol['id'].to_array().tolist()

    chData = ChEmblData()
    logger.info('Fetching ChEMBL id for %s', mols_tobe_added)
    mols_tobe_added = [
        str(row[0])
        for row in chData.fetch_chemblId_by_molregno(mols_tobe_added)
    ]
    logger.info('ChEMBL ids to be added %s', mols_tobe_added)

    # Molecules to be used for clustering
    mol_df = mol_df[:n_molecules - 10]

    wf = GpuKmeansUmap(n_molecules=n_molecules, dao=dao, pca_comps=64)
    wf.cluster(df_mol_embedding=mol_df)

    missing_mols, molregnos, df_embedding = wf.add_molecules(mols_tobe_added)
    assert len(missing_mols) == 10, \
        'Expected 10 missing molecules, found %d' % len(missing_mols)

    # TODO: Once the issue with add_molecule in multi-gpu env. is fixed, the
    # number of missing_molregno found should be 0
    missing_mols, molregnos, df_embedding = wf.add_molecules(mols_tobe_added)
    assert len(missing_mols) == 0, \
        'Expected no missing molecules, found %d' % len(missing_mols)
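
ChEmblData.fetch_chemblId_by_molregno maps ChEMBL's internal molregno keys to public ChEMBL identifiers. For reference, the sketch below performs the equivalent lookup directly against a local ChEMBL SQLite dump; the database path is an assumption, while molecule_dictionary is the standard ChEMBL table that stores this mapping.

import sqlite3


def fetch_chemblId_by_molregno(molregnos, db_path='/data/db/chembl_27.db'):
    # Resolve molregno primary keys to chembl_id strings via ChEMBL's
    # molecule_dictionary table; db_path points at an assumed local dump.
    placeholders = ','.join('?' for _ in molregnos)
    query = ('SELECT chembl_id FROM molecule_dictionary '
             'WHERE molregno IN (%s)' % placeholders)
    with sqlite3.connect(db_path) as conn:
        return conn.execute(query, molregnos).fetchall()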
Example #5
def generate():
    wf = MolBART()
    num_to_add = 21

    def _generate(data):
        smiles = data['canonical_smiles']
        lipinski_score, qed = score_molecule(smiles)

        num_to_generate = 40
        lipinski_scores = []
        qed_scores = []
        valid_list = []

        try:
            if lipinski_score >= 3 and qed <= LipinskiRuleOfFiveDecorator.MAX_QED:
                generated_list = wf.find_similars_smiles_list(
                    smiles, num_requested=num_to_generate, radius=0.0001)
                for new_smiles in generated_list:
                    lipinski_score, qed = score_molecule(new_smiles)

                    if lipinski_score >= 3 and qed <= LipinskiRuleOfFiveDecorator.MAX_QED:
                        valid_list.append(new_smiles)
                        lipinski_scores.append(lipinski_score)
                        qed_scores.append(qed)

                    if len(valid_list) >= num_to_add:
                        break
        except Exception:
            # Tolerate generation/scoring failures; the padding below keeps
            # every output row the same width.
            pass

        valid_list += [''] * (num_to_add - len(valid_list))
        lipinski_scores += [0] * (num_to_add - len(lipinski_scores))
        qed_scores += [0] * (num_to_add - len(qed_scores))

        return valid_list + lipinski_scores + qed_scores

    data = pandas.read_csv(
        '/workspace/tests/data/benchmark_approved_drugs.csv')

    # Dask meta: the first num_to_add columns hold generated SMILES
    # (object), the next num_to_add hold Lipinski scores (int8), and the
    # last num_to_add hold QED scores (float64).
    prop_meta = {i: pandas.Series([], dtype='object')
                 for i in range(num_to_add)}
    prop_meta.update({num_to_add + i: pandas.Series([], dtype='int8')
                      for i in range(num_to_add)})
    prop_meta.update({(2 * num_to_add) + i: pandas.Series([], dtype='float64')
                      for i in range(num_to_add)})
    meta_df = pandas.DataFrame(prop_meta)

    _create_context()
    ddf = dd.from_pandas(data, npartitions=4 * multiprocessing.cpu_count())
    ddf = ddf.map_partitions(
        lambda dframe: dframe.apply(_generate, result_type='expand', axis=1),
        meta=meta_df)
    ddf = ddf.compute(scheduler='processes')
    ddf.to_csv("/workspace/similar_mols.csv")
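
score_molecule is assumed to return the number of Lipinski rules a molecule satisfies together with its QED value, which is how _generate's lipinski_score >= 3 gate reads. A minimal RDKit-based stand-in under that assumption:

from rdkit import Chem
from rdkit.Chem import Descriptors, QED


def score_molecule(smiles):
    # Count how many of Lipinski's four rules the molecule satisfies and
    # compute its QED; unparsable SMILES get a zero rule count, which
    # fails the `lipinski_score >= 3` gate above.
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return 0, 1.0
    rules = [
        Descriptors.MolWt(mol) <= 500,
        Descriptors.MolLogP(mol) <= 5,
        Descriptors.NumHDonors(mol) <= 5,
        Descriptors.NumHAcceptors(mol) <= 10,
    ]
    return sum(rules), QED.qed(mol)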