Example #1
    def split(self,
              dataset,
              seed=None,
              frac_train=.8,
              frac_valid=.1,
              frac_test=.1,
              log_every_n=None):
        """
    Splits internal compounds randomly into train/validation/test.
    """
        np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.)
        if seed is None:
            seed = random.randint(0, 2**30)
        np.random.seed(seed)

        num_datapoints = len(dataset)

        train_cutoff = int(frac_train * num_datapoints)
        valid_cutoff = int((frac_train + frac_valid) * num_datapoints)

        num_train = train_cutoff
        num_valid = valid_cutoff - train_cutoff
        num_test = num_datapoints - valid_cutoff

        all_mols = [Chem.MolFromSmiles(smiles) for smiles in dataset.ids]

        fps = [
            AllChem.GetMorganFingerprintAsBitVect(x, 2, 1024) for x in all_mols
        ]

        def distance(i, j):
            return 1 - DataStructs.DiceSimilarity(fps[i], fps[j])

        picker = MaxMinPicker()
        testIndices = picker.LazyPick(distFunc=distance,
                                      poolSize=num_datapoints,
                                      pickSize=num_test,
                                      seed=seed)

        validTestIndices = picker.LazyPick(distFunc=distance,
                                           poolSize=num_datapoints,
                                           pickSize=num_valid + num_test,
                                           firstPicks=testIndices,
                                           seed=seed)

        allSet = set(range(num_datapoints))
        testSet = set(testIndices)
        validSet = set(validTestIndices) - testSet

        trainSet = allSet - testSet - validSet

        assert len(testSet & validSet) == 0
        assert len(testSet & trainSet) == 0
        assert len(validSet & trainSet) == 0
        assert (validSet | trainSet | testSet) == allSet

        return sorted(trainSet), sorted(validSet), sorted(testSet)
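The method above is an excerpt from a splitter class and omits its imports; a minimal sketch of what it assumes (all standard RDKit/NumPy modules):

import random

import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker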
Example #2
def diverse_mols_indexes(mol_list, n_pick, radius=4, seed=42):
    fps = [GetMorganFingerprint(mol, radius) for mol in mol_list]
    picker = MaxMinPicker()
    n_fps = len(fps)

    def fp_distance(i, j):
        return 1 - DataStructs.DiceSimilarity(fps[i], fps[j])
    indexes = picker.LazyPick(fp_distance, n_fps, n_pick, seed=seed)
    return indexes
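A hypothetical usage of diverse_mols_indexes, with the RDKit imports the snippet assumes:

from rdkit import Chem, DataStructs
from rdkit.Chem.rdMolDescriptors import GetMorganFingerprint
from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker

# pick the 2 most mutually dissimilar of four small molecules
mols = [Chem.MolFromSmiles(s) for s in ("CCO", "CCN", "c1ccccc1", "CC(=O)O")]
picks = list(diverse_mols_indexes(mols, n_pick=2))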
Example #3
def pick_diverse(
    mols: List[Chem.rdchem.Mol],
    npick: int,
    initial_picks: List[int] = None,
    feature_fn: Callable = None,
    dist_fn: Callable = None,
    seed: int = 42,
    n_jobs: Optional[int] = 1,
):
    r"""Pick a set of diverse molecules based on they fingerprint.

    Args:
        mols: a list of molecules.
        npick: Number of element to pick from mols, including the preselection.
        initial_picks: Starting list of index for molecules that should be in the
            set of picked molecules. Default to None.
        feature_fn: A feature function that takes a Chem.rdchem.Mol object
            and return molecular features. By default, the `dm.to_fp()` is used.
            Default to None.
        dist_fn: A function that takes two indexes (i,j) and return the
            distance between them. You might use partial to set the fingerprints as input.
            By default, the Tanimoto similarity will be used. Default to None.
        seed: seed for reproducibility
        n_jobs: Number of jobs for parallelization. Let to 1 for no
            parallelization. Set to None to use all available cores.

    Returns:
        picked_inds: index of the molecule that have been picked
        mols: molecules that have been picked
    """

    if feature_fn is None:
        feature_fn = functools.partial(dm.to_fp, as_array=False)

    features = dm.parallelized(feature_fn, mols, n_jobs=n_jobs)

    def distij(i, j, features=features):
        return 1.0 - DataStructs.TanimotoSimilarity(features[i], features[j])

    if dist_fn is None:
        dist_fn = distij

    picker = MaxMinPicker()
    initial_picks = [] if initial_picks is None else initial_picks
    picked_inds = picker.LazyPick(dist_fn,
                                  len(mols),
                                  npick,
                                  firstPicks=initial_picks,
                                  seed=seed)
    picked_inds = np.array(picked_inds)
    picked_mols = [mols[x] for x in picked_inds]

    return picked_inds, picked_mols
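A hypothetical usage, assuming datamol is installed and imported as `dm` (the module the function already relies on):

import datamol as dm

mols = [dm.to_mol(s) for s in ("CCO", "CCN", "c1ccccc1", "CC(=O)O")]
picked_inds, picked_mols = pick_diverse(mols, npick=2)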
Example #4
    def query(self, n):
        idxs_unlabeled = np.arange(self.n_pool)[~self.idxs_lb]
        if self.args.data_pool is not None:
            idxs_unlabeled = np.random.choice(idxs_unlabeled,
                                              self.args.data_pool,
                                              replace=False)

        embedding = self.get_embedding(
            MoleculeDataset(self.data[idxs_unlabeled]))

        def distij(i, j, data=embedding):
            # elementwise sqrt of squares reduces to absolute differences,
            # so this is the Manhattan (L1) distance between embeddings
            return np.sum(np.abs(np.array(data[i]) - np.array(data[j])))

        picker = MaxMinPicker()
        pickIndices = picker.LazyPick(distij, embedding.shape[0], n)

        return idxs_unlabeled[pickIndices]
Example #5
def SelectMoleculesUsingMaxMin(Mols, MolsFingerprints):
    """Select diverse molecules using MaxMin methodology."""

    MiscUtil.PrintInfo(
        "\nSelecting diverse molecules using MaxMin methodology and %s similarity metric..."
        % OptionsInfo["SimilarityMetric"])

    DiverseMols = []

    PoolSize = len(MolsFingerprints)
    PickSize = OptionsInfo["NumMols"]
    SimilarityFunction = OptionsInfo["SimilarityFunction"]

    Picker = MaxMinPicker()
    PairwiseDistance = lambda i, j: 1 - SimilarityFunction(
        MolsFingerprints[i], MolsFingerprints[j])

    MolIndices = Picker.LazyPick(PairwiseDistance, PoolSize, PickSize)

    for Index in list(MolIndices):
        DiverseMols.append(Mols[Index])

    return DiverseMols
Example #6
def pick_subset(mols, num=5, radius=3, seed=-1):
    """
    Pick a disparate subset of molecules using Morgan Fingerprints.
    https://towardsdatascience.com/a-practical-introduction-to-the-use-of-molecular-fingerprints-in-drug-discovery-7f15021be2b1

    :param mols: an iterable of molecules
    :param num: number of molecules to pick
    :param radius:
    :return: list of integer locations of the subset of molecules
    """
    fps = [GetMorganFingerprint(mol, radius) for mol in mols]

    def distij(i, j, fps=fps):
        return 1 - DataStructs.DiceSimilarity(fps[i], fps[j])

    return list(MaxMinPicker().LazyPick(distij, len(fps), num, seed=seed))
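A hypothetical usage, assuming GetMorganFingerprint, DataStructs, and MaxMinPicker are imported from RDKit as in the previous examples:

from rdkit import Chem

mols = [Chem.MolFromSmiles(s) for s in ("CCO", "CCN", "c1ccccc1", "CC(=O)O")]
subset_locs = pick_subset(mols, num=2, seed=42)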
Example #7
    def diversity_pick(self, n, firstpicks=[]):
        """
		Picks a maximally diverse subset of a sert of molecules using the RDKit
		MaxMinPicker. Optionally, a list of names with already
		chosen molecules can be specified.
		"""
        assert type(firstpicks) in [
            list, set
        ], 'Error, the input firstpicks must be a list or set'
        assert all([
            type(s) is str for s in firstpicks
        ]), 'Error, each item in the input list firstpicks must be a string'

        assert type(n) is int, 'Error, the input n must be an integer'

        firstpicks = [clean_name(s) for s in firstpicks]
        assert all([
            s in self.input_names for s in firstpicks
        ]), 'Error, not all firstpicks are part of the molecule list'
        assert n < len(self.input_names) - len(
            firstpicks
        ), 'Error, you have specified an n that is greater than or equal to the number of available molecules'

        # get indices of already picked molecules
        ind = []
        for x in firstpicks:
            ind.append(
                self.input_names.index(x))  # indices of picked molecules

        # compute all pairwise distances (flattened lower-triangular matrix)
        ds = []
        score = self.metrics[self.metric]
        for i in range(1, len(self.fingerprint_data)):
            ds.extend(
                score(self.fingerprint_data[i],
                      self.fingerprint_data[:i],
                      returnDistance=True))

        # make the selection (returns indices)
        ids = MaxMinPicker().Pick(np.array(ds), len(self.fingerprint_data), n,
                                  ind)

        return [self.input_names[s] for s in ids]
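Unlike LazyPick, the Pick variant used above consumes a precomputed distance matrix flattened in lower-triangular order (row i contributes its distances to items 0..i-1). A minimal self-contained sketch of that pattern, assuming Morgan bit vectors and Tanimoto distances:

import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker

mols = [Chem.MolFromSmiles(s) for s in ("CCO", "CCN", "c1ccccc1", "CC(=O)O")]
fps = [AllChem.GetMorganFingerprintAsBitVect(m, 2, 1024) for m in mols]

# flattened lower-triangular distance matrix
dists = []
for i in range(1, len(fps)):
    sims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
    dists.extend(1.0 - s for s in sims)

picker = MaxMinPicker()
pick_ids = picker.Pick(np.array(dists), len(fps), 2)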
Example #8
# This will hold a series of dicts of metrics that we then build into a dataframe
metric_dict_list = []
# using the longest, most informative embeddings

classifier_dict = {
    'SVM': train_SVM,
    'RF': train_RF,
    'LGBM': train_LGBM,
    'DNN': train_DNN,
    'GCNN_pytorch': train_PyTorchGCNN
}
#model_list = ['GCNN_pytorch','SVM','RF','LGBM','DNN']
model_list = ['RF']

num_models = len(model_list)
mmp = MaxMinPicker()
# define how we select after the initial training run
selection_type = 'Diverse'
# define each iteration's size (after the first 10% training set) relative to that training-set size
iterRel2Start = 0.5
end_iter = 1 + (4 / iterRel2Start)
for AID in AID_list:
    for model_type in ['SVM']:
        if 'win' in sys.platform:
            AID_path = os.path.join(r'C:\Users\gdrei\Dropbox\UCL\Thesis\Data',
                                    AID)
        else:
            AID_path = os.path.join('/home/gabriel/Dropbox/UCL/Thesis/Data/',
                                    AID)
        save_path = AID_path + '/' + AID + 'graph_processed.pkl'
        pickle_off = open(save_path, 'rb')
Example #9
classifier_dict = {
    'SVM': train_SVM,
    'RF': train_RF,
    'LGBM': train_LGBM,
    'DNN': train_DNN,
    'GCNN_pytorch': train_PyTorchGCNN,
    'random': train_random_classifier
}

model_list = ['GCNN_pytorch', 'SVM', 'RF', 'LGBM', 'DNN', 'random']
#model_list = ['RF']
#model_list = ['GCNN_pytorch']
#model_list = ['SVM']

num_models = len(model_list)
mmp = MaxMinPicker()


def getRandomIterInds(firstPicksList, fplist, bottom_to_select):
    full_list_index = np.arange(len(fplist))
    unselected_inds = list(set(full_list_index) - set(firstPicksList))
    random_selection = np.random.choice(unselected_inds,
                                        bottom_to_select,
                                        replace=False)
    start_indexs = np.concatenate((firstPicksList, random_selection), axis=0)
    return start_indexs


def getNextIterInds(firstPicksList, fplist, bottom_to_select):
    diverse_picks = mmp.LazyBitVectorPick(
        fplist, len(fplist),
        len(firstPicksList) + bottom_to_select, firstPicksList)
    start_indexs = np.array(diverse_picks)
    return start_indexs
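LazyBitVectorPick, used here, takes the bit-vector fingerprints themselves and computes Tanimoto distances internally, avoiding a Python-level distance callback. A minimal self-contained sketch:

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker

mols = [Chem.MolFromSmiles(s) for s in ("CCO", "CCN", "c1ccccc1", "CC(=O)O")]
fps = [AllChem.GetMorganFingerprintAsBitVect(m, 2, 1024) for m in mols]
picks = list(MaxMinPicker().LazyBitVectorPick(fps, len(fps), 2, seed=42))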
Example #10
    print(novel / len(smiles_list))

    ## Diversity sampling

    from time import time

    from rdkit import Chem
    from rdkit.Chem import Draw
    from rdkit.Chem.rdMolDescriptors import GetMorganFingerprint
    from rdkit import DataStructs
    from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker

    ms = [Chem.MolFromSmiles(s) for s in smiles_list]
    start = time()
    fps = [GetMorganFingerprint(x, 3) for x in ms]
    nfps = len(fps)
    end = time()
    print(f'Time for {nfps} fingerprints: ', end - start)

    def distij(i, j, fps=fps):
        return 1 - DataStructs.DiceSimilarity(fps[i], fps[j])

    picker = MaxMinPicker()
    start = time()
    pickIndices = picker.LazyPick(distij, nfps, 1000, seed=23)
    end = time()
    idces = list(pickIndices)
    print('Time for picker: ', end - start)

    m_selected = [ms[i] for i in idces]
    img = Draw.MolsToGridImage(m_selected)
    img  # displays inline in a notebook
    img.save('diverse.png')
Example #11
def pick_centroids(
    mols: List[Chem.rdchem.Mol],
    npick: int = 0,
    initial_picks: List[int] = None,
    threshold: float = 0.5,
    feature_fn: Callable = None,
    dist_fn: Callable = None,
    seed: int = 42,
    method: str = "sphere",
    n_jobs: Optional[int] = 1,
):
    r"""Pick a set of `npick` centroids from a list of molecules.

    Args:
        mols: a list of molecules.
        npick: Number of element to pick from mols, including the preselection.
        threshold: Minimum distance between centroids for `maxmin` and sphere exclusion (`sphere`) methods.
        initial_picks: Starting list of index for molecules that should be in the
            set of picked molecules. Default to None.
        feature_fn (callable, optional): A feature function that takes a Chem.rdchem.Mol object
            and return molecular features. By default, the `dm.to_fp()` is used.
            Default to None.
        dist_fn: A function that takes two indexes (i,j) and return the
            distance between them. You might use partial to set the fingerprints as input.
            By default, the Tanimoto similarity will be used. Default to None.
        seed: seed for reproducibility
        method: Picking method to use. One of  `sphere`, `maxmin` or any
            supported rdkit hierarchical clustering method such as `centroid`, `clink`, `upgma`
        n_jobs: Number of jobs for parallelization. Let to 1 for no
            parallelization. Set to None to use all available cores.

    Returns:
        picked_inds: index of the molecule that have been selected as centroids
        mols: molecules that have been picked
    """

    n_mols = len(mols)
    if feature_fn is None:
        feature_fn = functools.partial(dm.to_fp, as_array=False)

    features = dm.parallelized(feature_fn, mols, n_jobs=n_jobs)

    def distij(i, j, features=features):
        return 1.0 - DataStructs.TanimotoSimilarity(features[i], features[j])

    if dist_fn is None:
        dist_fn = distij

    initial_picks = [] if initial_picks is None else initial_picks

    if method == "maxmin":
        picker = MaxMinPicker()
        picked_inds, _ = picker.LazyPickWithThreshold(
            dist_fn,
            n_mols,
            pickSize=npick,
            threshold=threshold,
            firstPicks=initial_picks,
            seed=seed,
        )

    elif method == "sphere":
        picker = LeaderPicker()
        picked_inds = picker.LazyPick(dist_fn,
                                      n_mols,
                                      threshold=threshold,
                                      pickSize=npick,
                                      firstPicks=initial_picks)

    elif method.upper() in ClusterMethod.names.keys() and npick:
        if initial_picks:
            logger.warning(
                "Initial picks are not supported by hierarchical clustering. "
                "Your picks have been discarded.")

        dist_mat = dm.parallelized(distij,
                                   list(
                                       zip(*np.tril_indices(len(mols), k=-1))),
                                   arg_type="args")
        dist_mat = np.asarray(dist_mat)
        picker = HierarchicalClusterPicker(ClusterMethod.names[method.upper()])
        picked_inds = picker.Pick(dist_mat, n_mols, npick)
    else:
        raise ValueError(
            f"Picking method {method} with {npick} elements to pick is not supported."
        )
    picked_inds = np.array(picked_inds)
    picked_mols = [mols[x] for x in picked_inds]

    return picked_inds, picked_mols
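A hypothetical usage with datamol, picking two sphere-exclusion centroids that are at least 0.7 apart in Tanimoto distance:

import datamol as dm

mols = [dm.to_mol(s) for s in ("CCO", "CCCCO", "c1ccccc1", "Cc1ccccc1")]
centroid_inds, centroid_mols = pick_centroids(mols, npick=2, threshold=0.7,
                                              method="sphere")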
Example #12
                      reverse=True)
# print the top ten matches
[
    print(Chem.MolToSmiles(m), m.GetProp('GENERIC_NAME'), sim)
    for m, sim in sorted_m__qs[:10]
]


### simple diversity picking
# define our way of getting distances between fps under given indices
def fp_distance(i, j, fingerprints=[fp for m, fp in molecule__fp]):
    return 1 - DataStructs.TanimotoSimilarity(fingerprints[i], fingerprints[j])


# instantiate the RDKit diversity picker
picker = MaxMinPicker()
# run the picker with the fp_distance function, get 10 diverse pick indices
pickIndices = picker.LazyPick(fp_distance, len(molecule__fp), 10, seed=42)
# grab the Molecule instances that have those picked indices
picks = [molecule__fp[i][0] for i in pickIndices]
[print(Chem.MolToSmiles(m), m.GetProp('GENERIC_NAME')) for m in picks]


## now invert it for similarity picking
# redefine the function
def fp_distance(i, j, fingerprints=[fp for m, fp in molecule__fp]):
    return DataStructs.TanimotoSimilarity(fingerprints[i], fingerprints[j])


# run the picker again & print the picked molecules.
# They should be noticeably more similar than those in the previous set
Example #13
    def execute(self):
        """
        Iteratively load pools of molecules from the input SMILES file and
        write a diverse MaxMin pick from each pool to the output file.
        """
        print()
        print("Loading input file with path: " + ZincPicker.input_file_path)
        zinc_for_sale_mol_supplier = Chem.SmilesMolSupplier(
            ZincPicker.input_file_path)
        num_none_mols = 0
        print("Output file path: " + ZincPicker.output_file_path)
        writer = Chem.SmilesWriter(ZincPicker.output_file_path)
        lower_index = 0
        upper_index = ZincPicker.pool_size
        print("Entering picking iterations...")
        print()
        for y in range(0, ZincPicker.num_iterations):
            print("Number of iteration: ", y)
            print("Lower index: ", lower_index)
            print("Upper index: ", upper_index)
            print("Loading molecules now...")
            molecules = []
            for x in range(lower_index, upper_index):
                mol = zinc_for_sale_mol_supplier[x]
                if mol is None:
                    num_none_mols += 1
                    continue
                molecules.append(mol)
            # redundant safety net: None entries were already skipped above
            while molecules.count(None):
                molecules.remove(None)
            # radius 3
            print("Number of molecules loaded: ", len(molecules))
            print("Calculating fingerprints...")
            self.fingerprint_list = [
                GetMorganFingerprint(x, 3) for x in molecules
            ]
            nfps = len(self.fingerprint_list)
            print("Number of fingerprints: ", nfps)
            print("Now MaxMin picking ", ZincPicker.pick_size,
                  " out of the fingerprint list...")
            picker = MaxMinPicker()
            pickIndices = picker.LazyPick(
                self.calculate_dice_similarity_distance,
                nfps,
                ZincPicker.pick_size,
                seed=23)
            print("Finished picking, writing to file...")
            for z in pickIndices:
                writer.write(molecules[z])
            # clear memory
            molecules = []
            self.fingerprint_list = []
            nfps = 0
            picker = None
            pickIndices = []
            # raise indices
            lower_index = lower_index + ZincPicker.pool_size
            upper_index = upper_index + ZincPicker.pool_size
            print("Finished this iteration, entering the next...")
            print()
        print("Execution successful.")
        print("Picked ",
              ZincPicker.pick_size * ZincPicker.num_iterations - num_none_mols,
              " out of ", ZincPicker.num_iterations * ZincPicker.pool_size,
              " molecules in ", ZincPicker.num_iterations,
              " iterations, while picking ", ZincPicker.pick_size,
              " in each iteration.")
Example #14
def main(repetition_number):
    '''import'''
    from comet_ml import Experiment
    exp = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj",
                     project_name="iter_baseline", workspace="gdreiman1",
                     disabled=False)
    exp.log_code = True
    exp.log_other('Hypothesis', '''Taking 5% batches: 80% of each batch is the
                  top-ranked compounds, the remaining 20% is a diverse selection
                  for the first 5 iterations, then reverts to random sampling''')
    import pickle, sys, os
    from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker
    import numpy as np
    from Iterative_help_funcs import (get_Scaled_Data, train_SVM, train_DNN,
                                      train_RF, train_LGBM,
                                      calc_and_save_metrics, train_PyTorchGCNN)
    from imblearn.over_sampling import RandomOverSampler
    # choosing a 3:1 Inactive to Active ratio
    ros = RandomOverSampler(sampling_strategy=0.33)
    import pandas as pd
    from joblib import Parallel, delayed
    from joblib.externals.loky import set_loky_pickler
    from joblib import parallel_backend
    import tensorflow as tf
    tf.logging.set_verbosity(tf.logging.ERROR)
    
    #%%
    '''Load data'''

    AID_list = ['AID_1345083', 'AID_624255', 'AID_449739', 'AID_995',
                'AID_938', 'AID_628', 'AID_596', 'AID_893', 'AID_894']
    #AID_list = ['AID_893','AID_894']

    # This will hold a series of dicts of metrics that we then build into a dataframe
    metric_dict_list = []
    multi_dump_path = os.path.join(
        '/home/gabriel/Dropbox/UCL/Thesis/Data/',
        'ranked_diverse_run' + str(repetition_number) + '.pkl')
    exp.log_other('Metrics Dict Path', multi_dump_path)
    # using the longest, most informative embeddings

    classifier_dict = {'SVM': train_SVM, 'RF': train_RF, 'LGBM': train_LGBM,
                       'DNN': train_DNN, 'GCNN_pytorch': train_PyTorchGCNN}

    #model_list = ['GCNN_pytorch','SVM','RF','LGBM','DNN']
    model_list = ['RF']

    num_models = len(model_list)
    mmp = MaxMinPicker()
    # define how we select after the initial training run
    selection_type = 'Diverse'
    # define each iteration's size (after the first 10% training set) relative to that training-set size
    iterRel2Start = 0.5
    end_iter = 10
    for AID in AID_list:
            if 'win' in sys.platform:
                AID_path = os.path.join(
                    r'C:\Users\gdrei\Dropbox\UCL\Thesis\Data', AID)
            else:
                AID_path = os.path.join(
                    '/home/gabriel/Dropbox/UCL/Thesis/Data/', AID)
            save_path = AID_path + '/' + AID + 'graph_processed.pkl'
            pickle_off = open(save_path, 'rb')
            activity_table = pickle.load(pickle_off)
            pickle_off.close()
            
            '''Pick diverse starting point of 10% of library'''
            fplist = [x for x in activity_table['bit_MFP']]
            '''start_indexs holds the indexes of molecules already scanned at the
            start of each iteration. So for the first iter it holds the diversity
            selection. For the second, it holds both the diversity selection and
            the molecules screened based on the results of the first training
            iteration, etc.'''
            start_index_meta_dict = {}
            # NOTE: rep_number (the number of repeated starting samples) is not
            # defined in this excerpt and presumably comes from an outer scope
            for i in range(rep_number):

                start_indexs = np.array(
                    mmp.LazyBitVectorPick(fplist, len(fplist),
                                          int(len(fplist) / 10)))
                '''store in a list that will vary as each model makes its predictions'''
                start_ind_list = [start_indexs for i in range(num_models)]
                index_name = "starting_sample_" + str(i)
                start_index_meta_dict[index_name] = start_ind_list
            diverse_size_list = [0 for i in range(num_models)]
            fp_metalist = [fplist for i in range(num_models)]
            library_size = len(fplist)
            iter_num = 0 
            while iter_num < end_iter:
                print("Beginning Iteration ",iter_num)
                if iter_num < 5:
                    selection_type = 'Diverse'
                else:
                    selection_type = 'Random'
                        
                '''run thru models and get their preds for this iter'''
                for list_idx, [model_type, start_indexs] in enumerate(
                        zip(model_list, start_ind_list)):
                    '''Get the data for the starting molecules: graphs for the
                    GCNN, otherwise the MFP_MolChars embedding'''
                    test_index = list(set(activity_table.index) - set(start_indexs))
                    # check that we haven't exceeded 50% of the library
                    if len(test_index) > int(0.5 * library_size):
                        if model_type == 'GCNN_pytorch':
                            embedding_type = 'Graph'
                        else:
                            embedding_type = 'MFPMolChars'
                        X_train, X_test, y_train, y_test = get_Scaled_Data(
                            start_indexs, test_index, activity_table, True,
                            embedding_type)
                        # oversample to 3:1 Inactive to Active
                        '''If the class ratio is already better than the
                        RandomOverSampler target it raises an error, so check
                        the enrichment before resampling'''
                        if (sum(y_train) / len(y_train)) < 0.25:
                            # need to ros iff the active ratio is worse than 1:3
                            if model_type == 'GCNN_pytorch':
                                '''ros doesn't like the graph data, so resample indices'''
                                over_X_train, over_y_train = ros.fit_resample(
                                    np.arange(len(X_train)).reshape((-1, 1)), y_train)
                                over_X_train = over_X_train.reshape(-1)
                                over_X_train = [X_train[i] for i in over_X_train]
                            else:
                                over_X_train, over_y_train = ros.fit_resample(X_train, y_train)
                        else:
                            # just use the current enriched sample
                            over_X_train, over_y_train = X_train, y_train
                        '''Initial train run'''
                        train_and_predict_model = classifier_dict[model_type]
                        # split here because the DNN and GCNN models also
                        # return their training history
                        if model_type == 'DNN' or model_type == 'GCNN_pytorch':
                            train_predicted_probs, test_predicted_probs, base_test_predicted_probs, hist = train_and_predict_model(
                                over_X_train, X_test, over_y_train, y_test, X_train)
                        else:
                            train_predicted_probs, test_predicted_probs, base_test_predicted_probs = train_and_predict_model(
                                over_X_train, X_test, over_y_train, y_test, X_train)
                            hist = None
                        metric_dict_list = calc_and_save_metrics(
                            y_test, test_predicted_probs, model_type,
                            embedding_type, AID, metric_dict_list, iter_num,
                            'test', hist)
                        metric_dict_list = calc_and_save_metrics(
                            over_y_train, train_predicted_probs, model_type,
                            embedding_type, AID, metric_dict_list, iter_num,
                            'train')
                        metric_dict_list = calc_and_save_metrics(
                            y_train, base_test_predicted_probs, model_type,
                            embedding_type, AID, metric_dict_list, iter_num,
                            'base_train')
                        '''Now select the next 5% batch'''
                        '''Put labels and preds in a df and sort them. Take 80%
                        of the batch size from the top predictions, then do a
                        diverse or random selection for the remaining 20%'''
                        preds_df = pd.DataFrame(
                            {'activity_table_index': np.array(test_index),
                             'prob_active': np.array(test_predicted_probs)},
                            columns=np.array(['activity_table_index', 'prob_active']))
                        preds_df.sort_values('prob_active', ascending=False,
                                             inplace=True, axis=0)
                        next_inds = []
                        top_to_select = int(len(activity_table) * 0.04)
                        explore_select = int(len(activity_table) * 0.01)
                        next_inds = next_inds + preds_df.head(
                            top_to_select)['activity_table_index'].tolist()
                        firstPicksList = next_inds + (start_indexs.tolist())
                        start_ind_list[list_idx] = firstPicksList
                        diverse_size_list[list_idx] = explore_select
    
                def getRandomIterInds(firstPicksList, fplist, bottom_to_select):
                    full_list_index = np.arange(len(fplist))
                    unselected_inds = list(set(full_list_index) - set(firstPicksList))
                    random_selection = np.random.choice(unselected_inds,
                                                        bottom_to_select,
                                                        replace=False)
                    start_indexs = np.concatenate(
                        (firstPicksList, random_selection), axis=0)
                    return start_indexs

                def getNextIterInds(firstPicksList, fplist, bottom_to_select):
                    diverse_picks = mmp.LazyBitVectorPick(
                        fplist, len(fplist),
                        len(firstPicksList) + bottom_to_select, firstPicksList)
                    start_indexs = np.array(diverse_picks)
                    return start_indexs
                with parallel_backend('multiprocessing'):
                    if selection_type == 'Diverse':
                        start_ind_list = Parallel(n_jobs=5)(
                            delayed(getNextIterInds)(
                                firstPicksList=i, fplist=j, bottom_to_select=k)
                            for i, j, k in zip(start_ind_list, fp_metalist,
                                               diverse_size_list))
                    elif selection_type == 'Random':
                        start_ind_list = Parallel(n_jobs=5)(
                            delayed(getRandomIterInds)(
                                firstPicksList=i, fplist=j, bottom_to_select=k)
                            for i, j, k in zip(start_ind_list, fp_metalist,
                                               diverse_size_list))
    
                metrics_df = pd.DataFrame(metric_dict_list)
                metrics_df.to_pickle(multi_dump_path)
                iter_num += 1
    exp.end()
Example #15
def main(prior_name, name, max_samples, diversity_picker, oracle, w_min):
    prior_model = model_from_json(prior_name)

    # We start by creating another prior instance, then replace it with the actual weights
    # name = search_vae
    search_model = model_from_json(prior_name)
    model_weights_path = os.path.join(script_dir, 'results', name,
                                      'weights.pth')
    search_model.load(model_weights_path)

    samples, weights = get_samples(prior_model,
                                   search_model,
                                   max=max_samples,
                                   w_min=w_min)

    # if 0 < diversity_picker < max_samples, we subsample with the rdkit picker:
    if 0 < diversity_picker < max_samples:
        mols = [Chem.MolFromSmiles(s) for s in samples]
        fps = [GetMorganFingerprint(x, 3) for x in mols]
        picker = MaxMinPicker()

        def distij(i, j, fps=fps):
            return 1 - DataStructs.DiceSimilarity(fps[i], fps[j])

        pickIndices = picker.LazyPick(distij, max_samples, diversity_picker)
        idces = list(pickIndices)
        samples = [samples[i] for i in idces]
        weights = [weights[i] for i in idces]

    # Since we don't maintain a dict for qed, we just give everything to the docker
    # (note: the `or True` deliberately forces this branch for now)
    if oracle != 'docking' or True:
        dump_path = os.path.join(script_dir, 'results', name,
                                 'docker_samples.p')
        pickle.dump(samples, open(dump_path, 'wb'))

        # Dump for the trainer
        dump_path = os.path.join(script_dir, 'results', name, 'samples.p')
        pickle.dump((samples, weights), open(dump_path, 'wb'))

    else:
        # Memoization: split the list into already-docked molecules and dump a simili-docking csv
        whole_path = os.path.join(script_dir, '..', 'data',
                                  'drd3_scores.pickle')
        docking_whole_results = pickle.load(open(whole_path, 'rb'))
        filtered_smiles = list()
        already_smiles = list()
        already_scores = list()
        for smile in samples:
            if smile in docking_whole_results:
                already_smiles.append(smile)
                already_scores.append(docking_whole_results[smile])
            else:
                filtered_smiles.append(smile)

        # Dump simili-docking
        dump_path = os.path.join(script_dir, 'results', name,
                                 'docking_small_results', 'simili.csv')
        df = pd.DataFrame.from_dict({
            'smile': already_smiles,
            'score': already_scores
        })
        df.to_csv(dump_path)

        # Dump for the docker
        dump_path = os.path.join(script_dir, 'results', name,
                                 'docker_samples.p')
        pickle.dump(filtered_smiles, open(dump_path, 'wb'))

        # Dump for the trainer
        dump_path = os.path.join(script_dir, 'results', name, 'samples.p')
        pickle.dump((samples, weights), open(dump_path, 'wb'))