Пример #1
0
def main(unused_argv):
    """Extract the continuous data-driven descriptors for a file of SMILES.

    Reads the table named by ``FLAGS.input``, optionally preprocesses the
    column ``FLAGS.smiles_header``, encodes the resulting SMILES with an
    ``InferenceModel`` and writes the input table joined with one
    ``cddd_<i>`` column per descriptor dimension to ``FLAGS.output``.

    When ``FLAGS.preprocess`` is set, rows whose preprocessed SMILES is NaN
    are not encoded; they stay in the output table without descriptor values
    because the descriptors are joined on the index of the valid rows only.

    Args:
        unused_argv: Not used by this function.
    """
    if FLAGS.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = str(FLAGS.device)
    model_dir = FLAGS.model_dir
    df = read_input(FLAGS.input)
    if FLAGS.preprocess:
        print("start preprocessing SMILES...")
        df["new_smiles"] = df[FLAGS.smiles_header].map(preprocess_smiles)
        # Select the interpretable rows once; their index is reused below to
        # align the descriptors with the original table.
        valid_rows = df[~df.new_smiles.isna()]
        sml_list = valid_rows.new_smiles.tolist()
        descriptor_index = valid_rows.index
        print("finished preprocessing SMILES!")
    else:
        sml_list = df[FLAGS.smiles_header].tolist()
        descriptor_index = df.index
    print("start calculating descriptors...")
    infer_model = InferenceModel(model_dir=model_dir,
                                 use_gpu=FLAGS.gpu,
                                 batch_size=FLAGS.batch_size,
                                 cpu_threads=FLAGS.cpu_threads)
    descriptors = infer_model.seq_to_emb(sml_list)
    print(
        "finished calculating descriptors! %d out of %d input SMILES could be interpreted"
        % (len(sml_list), len(df)))
    descriptor_df = pd.DataFrame(descriptors, index=descriptor_index)
    # Name the columns after the actual embedding width instead of assuming
    # 512, so models with a different embedding size work as well.
    descriptor_df.columns = [
        "cddd_" + str(i + 1) for i in range(descriptor_df.shape[1])
    ]
    df = df.join(descriptor_df)
    print("writing descriptors to file...")
    df.to_csv(FLAGS.output)
Пример #2
0
 def __init__(self, use_gpu=True, cpu_threads=5, model_dir=None, **kwargs):
     """Set up the transformation and load the inference model.

     Args:
         use_gpu: Forwarded to ``InferenceModel``.
         cpu_threads: Forwarded to ``InferenceModel``.
         model_dir: Directory of the model to load. When ``None`` the
             directory returned by ``download_cddd_models()`` is used.
         **kwargs: Overrides applied on top of the defaults registered for
             this transformation in ``TransformationDefaults``.
     """
     self.name = __class__.__name__.split('.')[-1]
     # Copy the defaults before updating: the enum member's value is shared,
     # so updating it in place would leak overrides into later instances.
     self.kwargs = dict(TransformationDefaults[self.name].value)
     self.kwargs.update(kwargs)
     # Honour an explicitly supplied model directory instead of always
     # overwriting it with the downloaded default.
     if model_dir is None:
         model_dir = download_cddd_models()
     self.func = InferenceModel(model_dir,
                                use_gpu=use_gpu,
                                cpu_threads=cpu_threads)
Пример #3
0
def _load_and_embed(infer_model, csv_path, task_name):
    """Read one benchmark CSV and return (scaled embeddings, labels, folds)."""
    task_df = pd.read_csv(csv_path)
    smiles = task_df.smiles.tolist()
    labels = task_df.label.values
    folds = task_df.fold.values
    print("Extracting molecular desscriptors for {}".format(task_name))
    emb = infer_model.seq_to_emb(smiles)
    # mean()/std() are called without an axis, i.e. the scaling uses one
    # statistic over the whole embedding (assumes an array-like result).
    emb = (emb - emb.mean()) / emb.std()
    return emb, labels, folds


def main(unused_argv):
    """Test how well the translation model's features work for QSAR modelling.

    Embeds the SMILES of ``ames.csv`` and ``lipo.csv`` (columns ``smiles``,
    ``label`` and ``fold``) with the inference model, then runs an SVM
    classifier on the Ames data and an SVM regressor on the lipophilicity
    data, each cross-validated with ``LeaveOneGroupOut`` over the ``fold``
    column. Mean and standard deviation of the scores are printed.

    Args:
        unused_argv: Not used by this function.
    """
    if FLAGS.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = str(FLAGS.device)
        print("use gpu {}".format(str(FLAGS.device)))
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
    model_dir = FLAGS.model_dir

    infer_model = InferenceModel(model_dir,
                                 use_gpu=FLAGS.gpu,
                                 cpu_threads=FLAGS.cpu_threads)
    ames_emb, ames_labels, ames_fold = _load_and_embed(
        infer_model, "ames.csv", "Ames")
    lipo_emb, lipo_labels, lipo_fold = _load_and_embed(
        infer_model, "lipo.csv", "Lipophilicity")

    print("Running SVM on Ames mutagenicity...")
    clf = SVC(C=5.0)
    # `groups` must be passed by keyword: it is keyword-only in current
    # scikit-learn releases, so the positional form raises a TypeError.
    result = cross_val_score(clf,
                             ames_emb,
                             ames_labels,
                             groups=ames_fold,
                             cv=LeaveOneGroupOut(),
                             n_jobs=5)
    print("Ames mutagenicity accuracy: %0.3f +/- %0.3f" %
          (np.mean(result), np.std(result)))

    print("Running SVM on Lipophilicity...")
    clf = SVR(C=5.0)
    result = cross_val_score(clf,
                             lipo_emb,
                             lipo_labels,
                             groups=lipo_fold,
                             cv=LeaveOneGroupOut(),
                             n_jobs=5)
    print("Lipophilicity r2: %0.3f +/- %0.3f" %
          (np.mean(result), np.std(result)))
Пример #4
0
def main(args):
    """Run a particle-swarm optimisation scored by MolScore.

    Args:
        args: Namespace-like object providing ``molscore_config``,
            ``infer_model``, ``smiles_file``, ``num_part``, ``num_swarm``
            and ``num_steps``.

    Raises:
        FileNotFoundError: If ``args.smiles_file`` does not exist.
    """
    # Initialize molscore
    ms = MolScore(model_name='mso', task_config=args.molscore_config)
    ms.log_parameters(vars(args))

    # Load inference model
    infer_model = InferenceModel(model_dir=args.infer_model)

    # Load init smiles. An explicit check is used instead of `assert`
    # because assertions are stripped when Python runs with -O.
    if not os.path.exists(args.smiles_file):
        raise FileNotFoundError(
            "SMILES file not found: {}".format(args.smiles_file))
    with open(args.smiles_file, 'r') as f:
        init_smiles = f.read().splitlines()

    # Initialize Optimizer
    opt = BasePSOptimizer.from_query(init_smiles=init_smiles,
                                     num_part=args.num_part,
                                     num_swarms=args.num_swarm,
                                     inference_model=infer_model,
                                     scoring_function=ms)

    # Run optimizer
    opt.run(args.num_steps)

    # Save everything
    ms.write_scores()
    ms.kill_dash_monitor()
Пример #5
0
class Embeddings(BaseTransformation):
    """Transformation that maps SMILES to model embeddings and back."""

    def __init__(self, use_gpu=True, cpu_threads=5, model_dir=None, **kwargs):
        """Set up the transformation and load the inference model.

        Args:
            use_gpu: Forwarded to ``InferenceModel``.
            cpu_threads: Forwarded to ``InferenceModel``.
            model_dir: Directory of the model to load. When ``None`` the
                directory returned by ``download_cddd_models()`` is used.
            **kwargs: Overrides applied on top of the defaults registered
                for this transformation in ``TransformationDefaults``.
        """
        self.name = __class__.__name__.split('.')[-1]
        # Copy the defaults before updating: the enum member's value is
        # shared, so updating it in place would leak overrides into later
        # instances.
        self.kwargs = dict(TransformationDefaults[self.name].value)
        self.kwargs.update(kwargs)
        # Honour an explicitly supplied model directory instead of always
        # overwriting it with the downloaded default.
        if model_dir is None:
            model_dir = download_cddd_models()
        self.func = InferenceModel(model_dir,
                                   use_gpu=use_gpu,
                                   cpu_threads=cpu_threads)

    def transform(self, data):
        """Embed the ``'transformed_smiles'`` entry of ``data``.

        Returns the result of ``seq_to_emb`` with ``squeeze()`` applied.
        """
        data = data['transformed_smiles']
        return self.func.seq_to_emb(data).squeeze()

    def inverse_transform(self, embeddings):
        """Decode embeddings back to sequences.

        Embedding array -- individual compound embeddings are in rows.
        """
        embeddings = np.asarray(embeddings)
        return self.func.emb_to_seq(embeddings)

    def __len__(self):
        """Return the embedding size reported by the model's hparams."""
        return self.func.hparams.emb_size
Пример #6
0
def getemb_from_smi(smiles_nonzero_list, smiles_zero_list, nonzero_id):
    """Encode two SMILES lists and build matching binary labels.

    Args:
        smiles_nonzero_list: SMILES encoded first; their embeddings form the
            leading rows of the result.
        smiles_zero_list: SMILES encoded second; their embeddings follow.
        nonzero_id: Number of leading rows labelled 1.0. All remaining rows
            are labelled 0.0.

    Returns:
        A tuple ``(x, y, dfsmi_eb)``: the stacked embeddings as a NumPy
        array, the label vector, and the stacked embeddings as a DataFrame
        (the two parts are concatenated without resetting the index).
    """
    encoder = InferenceModel()

    # Encode each list in turn and wrap the embeddings in a DataFrame.
    frames = [
        pd.DataFrame(encoder.seq_to_emb(smiles))
        for smiles in (smiles_nonzero_list, smiles_zero_list)
    ]
    dfsmi_eb = pd.concat(frames)

    x = dfsmi_eb.to_numpy()
    n_rows = x.shape[0]
    positives = np.ones(nonzero_id)
    negatives = np.zeros(n_rows - nonzero_id)
    y = np.concatenate([positives, negatives])

    return x, y, dfsmi_eb
Пример #7
0
def optimize(chid,
             n_estimators,
             n_jobs,
             external_file,
             n_external,
             seed,
             opt_name,
             optimizer_args,
             log_base):
    """
    Fit classifiers for an assay, run a molecule optimizer against one of
    them and store the scores of all classifiers on the optimization history.

    Args:
        - chid: which assay to use
        - n_estimators: how many trees to use in Random Forest
        - n_jobs: how many parallel processes to use
        - external_file: Smiles that are not used for optimization
        - n_external: on how many such independent random points to calculate scores
        - seed: which random seed to use
        - opt_name: which optimizer to use (graph_ga, lstm_hc or mso)
        - optimizer_args: dictionary with arguments for the optimizer; for
          'mso' it is unpacked into MsoWrapper(smi_file, num_part, num_iter)
        - log_base: Where to store results. The results directory is
          log_base/opt_name/chid/<timestamp>

    Raises:
        ValueError: if opt_name is not one of the supported optimizers.

    Writes config.json, classifiers.p, split1.csv, split2.csv and
    results.json into the results directory. Returns nothing.
    """
    # Must stay the first statement: at this point locals() holds only the
    # call arguments, which is what gets written to config.json below.
    config = locals()

    # Results might not be fully reproducible when using pytorch
    # https://pytorch.org/docs/stable/notes/randomness.html
    np.random.seed(seed)
    torch.manual_seed(seed)

    # set up logging
    results_dir = os.path.join(log_base, opt_name, chid, timestamp())
    # No exist_ok: an already existing results directory raises here.
    os.makedirs(results_dir)

    config_file = os.path.join(results_dir, 'config.json')
    with open(config_file, 'w') as f:
        json.dump(config, f)

    clfs, aucs, balance, df1, df2 = fit_clfs(chid, n_estimators, n_jobs)
    results = {}
    results['AUC'] = aucs
    results['balance'] = balance

    clf_file = os.path.join(results_dir, 'classifiers.p')
    with open(clf_file, 'wb') as f:
        pickle.dump(clfs, f)

    df1.to_csv(os.path.join(results_dir, 'split1.csv'), index=False)
    df2.to_csv(os.path.join(results_dir, 'split2.csv'), index=False)

    # Create guacamol scoring function with clf trained on split 1
    scoring_function = TPScoringFunction(clfs['Split1'])


    # The CDDD inference model used to encode/decode molecular SMILES strings
    # to/from the CDDD space. You might need to specify the path to the
    # pretrained model (e.g. default_model).
    # NOTE(review): this is constructed for every opt_name although only the
    # 'mso' path (MsoWrapper.run) uses it — confirm whether that is intended.
    infer_model = InferenceModel(model_dir='default_model')



    # Wrap the classifier-based scoring function for the MSO optimizer; it is
    # declared as operating on SMILES rather than on mol objects.
    mso_score = [ScoringFunction(func=scoring_function.raw_score_list, name='score', is_mol_func=False, is_smiles_func=True)]

    # from mso.objectives.mol_functions import qed_score
    # mso_score = [ScoringFunction(func=qed_score,            name="qed", is_mol_func=True)] # wrap the drug likeness score inside a scoring function instance

    class MsoWrapper(object):
        """Adapter that runs BasePSOptimizer from a pool of start SMILES."""

        def __init__(self, smi_file, num_part, num_iter):
            self.smi_file = smi_file
            self.num_part = num_part
            self.num_iter = num_iter
            # The start pool is the whitespace-separated content of smi_file.
            with open(self.smi_file) as f:
                self.start_pool = f.read().split()

        def run(self):
            """Run the optimizer for num_iter steps and return its SMILES history."""
            # Draw self.num_part start SMILES from the pool (uses the global
            # NumPy RNG seeded at the top of optimize()).
            init_smiles = list(np.random.choice(self.start_pool, self.num_part))

            # Debug output left in by the original author.
            print(type(init_smiles))

            # NOTE(review): num_part is hard-coded to 200 here although
            # init_smiles was sampled with self.num_part — confirm whether
            # self.num_part was intended.
            opt = BasePSOptimizer.from_query(
                init_smiles=init_smiles,
                num_part=200,
                num_swarms=1,
                inference_model=infer_model,
                scoring_functions=mso_score)

            _, smiles_history = opt.run(self.num_iter)
            return smiles_history

    # run optimization
    t0 = time()
    if opt_name == 'graph_ga':
        optimizer = GB_GA_Generator(**optimizer_args)
    elif opt_name == 'lstm_hc':
        optimizer = SmilesRnnDirectedGenerator(**optimizer_args)
    elif opt_name == 'mso':
        optimizer = MsoWrapper(**optimizer_args)
    else:
        raise ValueError(f'Invalid optimizer: {opt_name}')

    # MsoWrapper exposes run(); the other optimizers are driven through
    # generate_optimized_molecules().
    if opt_name == 'mso':
        smiles_history = optimizer.run()
    else:
        smiles_history = optimizer.generate_optimized_molecules(
            scoring_function, 100, get_history=True)

    # Apply can_list to every step of the history.
    smiles_history = [can_list(e) for e in smiles_history]

    t1 = time()
    # Covers optimizer construction, the run itself and the can_list pass.
    opt_time = t1 - t0

    # make a list of dictionaries for every time step
    # this is far from an optimal data structure
    statistics = []
    for optimized_smiles in smiles_history:
        row = {}
        row['smiles'] = optimized_smiles
        row['preds'] = {}
        # Score the step's SMILES with every classifier in clfs, not only
        # the one used as the optimization target.
        for k, clf in clfs.items():
            preds = score(optimized_smiles, clf)
            if None in preds:
                print('Invalid score. Debug message')
            row['preds'][k] = preds
        statistics.append(row)

    results['statistics'] = statistics

    stat_time = time() - t1
    # add predictions on external set
    # load external smiles for evaluation
    with open(external_file) as f:
        external_smiles = f.read().split()
    # Draw n_external SMILES from the external pool with the global NumPy RNG.
    external_smiles = np.random.choice(external_smiles, n_external)
    results['predictions_external'] = {k: score(external_smiles, clf) for k, clf in clfs.items()}

    results_file = os.path.join(results_dir, 'results.json')
    with open(results_file, 'w') as f:
        json.dump(results, f)

    print(f'Storing results in {results_dir}')
    print(f'Optimization time {opt_time:.2f}')
    print(f'Statistics time {stat_time:.2f}')
Пример #8
0
def cv(dataset, summary_df, cddd_model_dir, molbert_model_dir):
    """Cross-validate SVMs on several molecular featurizations of a dataset.

    For every split returned by ``get_data(dataset)`` and every feature type
    (cddd, molbert, ECFP4, rdkit_norm) an SVM is fitted on train+valid,
    evaluated on test, and its metrics are written as JSON to
    ``./logs/<timestamp>/<dataset>/<split>/<feature>_metrics.json``.

    Args:
        dataset: Task name; passed to ``get_data`` and matched against the
            ``task_name`` column of ``summary_df``.
        summary_df: DataFrame with ``task_name`` and ``task_type`` columns;
            ``task_type`` must be ``'regression'`` or ``'classification'``.
        cddd_model_dir: Passed to ``InferenceModel``.
        molbert_model_dir: Passed to ``MolBertFeaturizer``.

    Raises:
        ValueError: If the task type is neither classification nor regression.
    """
    df, indices = get_data(dataset)
    cddd = InferenceModel(cddd_model_dir)  # type: ignore
    molbert = MolBertFeaturizer(molbert_model_dir,
                                embedding_type='average-1-cat-pooled',
                                max_seq_len=200,
                                device='cpu')  # type: ignore
    ecfp = MorganFPFeaturizer(fp_size=2048,
                              radius=2,
                              use_counts=True,
                              use_features=False)
    rdkit_norm = PhysChemFeaturizer(normalise=True)

    def cddd_fn(smiles):
        return cddd.seq_to_emb(smiles)

    # The featurizers' transform() result is indexed with [0] to obtain the
    # feature matrix.
    def molbert_fn(smiles):
        return molbert.transform(smiles)[0]

    def ecfp_fn(smiles):
        return ecfp.transform(smiles)[0]

    def rdkit_norm_fn(smiles):
        return rdkit_norm.transform(smiles)[0]

    # Loop-invariant: the same feature types and label column for all splits.
    fn_combos = [('cddd', cddd_fn), ('molbert', molbert_fn),
                 ('ECFP4', ecfp_fn), ('rdkit_norm', rdkit_norm_fn)]
    label_column = df.columns[-1]

    for i, (train_idx, valid_idx, test_idx) in enumerate(indices):
        train_df = df.iloc[train_idx]
        valid_df = df.iloc[valid_idx]

        # combine train and valid set as SVMs don't use a validation set, but NNs do.
        # this way they use the same amount of data.
        train_df = pd.concat([train_df, valid_df])
        test_df = df.iloc[test_idx]

        train_labels = train_df[label_column]
        test_labels = test_df[label_column]

        # The task type does not depend on the feature type; look it up once
        # per split instead of once per feature type.
        mode = summary_df[summary_df['task_name'] ==
                          dataset].iloc[0]['task_type'].strip()

        for feat_name, feat_fn in fn_combos:
            train_features = np.vstack([
                feat_fn(batch) for batch in batchify(train_df['SMILES'], 256)
            ])
            test_features = np.vstack(
                [feat_fn(batch) for batch in batchify(test_df['SMILES'], 256)])

            np.random.seed(i)
            if mode == 'regression':
                model = SVR(C=5.0)
            elif mode == 'classification':
                # C must be passed by keyword: SVC's constructor parameters
                # are keyword-only in current scikit-learn releases.
                model = SVC(C=5.0, probability=True)
            else:
                raise ValueError(
                    f'Mode has to be either classification or regression but was {mode}.'
                )

            model.fit(train_features, train_labels)

            predictions = model.predict(test_features)

            # Metrics are wrapped in callables so that each one can fail
            # independently in the loop below.
            if mode == 'classification':
                # predict probabilities (needed for some metrics) and get probs of positive class ([:, 1])
                prob_predictions = model.predict_proba(test_features)[:, 1]
                metrics_dict = {
                    'AUROC':
                    lambda: metrics.roc_auc_score(test_labels, prob_predictions
                                                  ),
                    'AveragePrecision':
                    lambda: metrics.average_precision_score(
                        test_labels, prob_predictions),
                    'Accuracy':
                    lambda: metrics.accuracy_score(test_labels, predictions),
                }
            else:
                metrics_dict = {
                    'MAE':
                    lambda: metrics.mean_absolute_error(
                        test_labels, predictions),
                    'RMSE':
                    lambda: np.sqrt(
                        metrics.mean_squared_error(test_labels, predictions)),
                    'MSE':
                    lambda: metrics.mean_squared_error(test_labels, predictions
                                                       ),
                    'R2':
                    lambda: metrics.r2_score(test_labels, predictions),
                }

            metric_values = {}
            for name, callable_metric in metrics_dict.items():
                # Best effort: a metric that cannot be computed is reported
                # and stored as NaN instead of aborting the run.
                try:
                    metric_values[name] = callable_metric()
                except Exception as e:
                    print(f'unable to calculate {name} metric')
                    print(e)
                    metric_values[name] = np.nan

            # NOTE(review): the timestamp is taken per feature type and
            # split, so the files of one run can land in different timestamp
            # directories — confirm whether one timestamp per run was
            # intended.
            default_path = os.path.join(
                './logs/',
                datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ'))
            output_dir = os.path.join(default_path, dataset, str(i))
            os.makedirs(output_dir, exist_ok=True)
            with open(os.path.join(output_dir, f'{feat_name}_metrics.json'),
                      'w+') as fp:
                json.dump(metric_values, fp)