Example #1
def get_rdkit_smiles_parent(data):
    """Strip the salts off the rdkit SMILES strings.

    Loops through data and determines the base/parent SMILES string for each row,
    appends each base SMILES string to a list, then adds the list as a new column
    in 'data'.
    """
    print("")
    print(
        "Adding SMILES column 'rdkit_smiles_parent' with salts stripped... (may take a while)",
        flush=True)

    i_max = data.shape[0]
    rdkit_smiles_parent = []
    for i in range(i_max):
        smile = data['rdkit_smiles'].iloc[i]
        # A float here is a missing value (NaN); map it to an empty string
        if isinstance(smile, float):
            split = ''
        else:
            split = base_smiles_from_smiles(smile)

        rdkit_smiles_parent.append(split)

    # Add the base SMILES strings (salts stripped) as a new column in the dataset
    data['rdkit_smiles_parent'] = rdkit_smiles_parent

    return data
Example #2
def get_rdkit_smiles_parent(data):
    """Strip the salts off the rdkit SMILES strings

    First, loops through data and determines the base/parent smiles string for each row.
    Appends the base smiles string to a new row in a list.
    Then adds the list as a new column, 'rdkit_smiles_parent', in 'data'.
    Basically calls base_smiles_from_smiles for each smile in the column 'rdkit_smiles'

    Args:
        data (DataFrame): A DataFrame with a column named 'rdkit_smiles'.

    Returns:
        DataFrame with column 'rdkit_smiles_parent' with salts stripped
    """
    print("")
    print(
        "Adding SMILES column 'rdkit_smiles_parent' with salts stripped...(may take a while)",
        flush=True)

    i_max = data.shape[0]
    rdkit_smiles_parent = []
    for i in range(i_max):
        smile = data['rdkit_smiles'].iloc[i]
        # A float here is a missing value (NaN); map it to an empty string
        if isinstance(smile, float):
            split = ''
        else:
            split = base_smiles_from_smiles(smile)

        rdkit_smiles_parent.append(split)

    # Add the base SMILES strings (salts stripped) as a new column in the dataset
    data['rdkit_smiles_parent'] = rdkit_smiles_parent

    return data
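
A minimal usage sketch for the function above, assuming base_smiles_from_smiles comes from AMPL's struct_utils module (the import path is an assumption) and using a hypothetical two-row DataFrame.

import pandas as pd
from atomsci.ddm.utils.struct_utils import base_smiles_from_smiles  # assumed import path

# Toy input: two salt forms
df = pd.DataFrame({'rdkit_smiles': ['CCO.Cl', 'c1ccccc1C(=O)[O-].[Na+]']})
df = get_rdkit_smiles_parent(df)
print(df['rdkit_smiles_parent'])  # salt-stripped parent SMILES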
Example #3
def _prepare_input_data(input_df, id_col, smiles_col, response_col, conc_col,
                        dont_standardize):
    """
    Prepare input data frame for running predictions
    """
    colnames = set(input_df.columns.values)
    if (id_col is None) or (id_col not in colnames):
        input_df['compound_id'] = [
            'compound_%.6d' % i for i in range(input_df.shape[0])
        ]
        id_col = 'compound_id'
    if smiles_col not in colnames:
        raise ValueError(
            'smiles_col parameter not specified or column not in input file.')
    if dont_standardize:
        std_smiles_col = smiles_col
    else:
        print("Standardizing SMILES strings for %d compounds." %
              input_df.shape[0])
        orig_ncmpds = input_df.shape[0]
        std_smiles = base_smiles_from_smiles(
            input_df[smiles_col].values.tolist(), workers=16)
        input_df['orig_smiles'] = input_df[smiles_col]
        input_df[smiles_col] = std_smiles
        input_df = input_df[input_df[smiles_col] != '']
        if input_df.shape[0] == 0:
            raise ValueError("No valid SMILES strings to predict on.")
        nlost = orig_ncmpds - input_df.shape[0]
        if nlost > 0:
            print(
                "Could not parse %d SMILES strings; will predict on the remainder."
                % nlost)

    pred_params = {
        'featurizer': 'computed_descriptors',
        'result_dir': tempfile.mkdtemp(),
        'id_col': id_col,
        'smiles_col': smiles_col
    }
    if (response_col is not None) and (response_col
                                       in input_df.columns.values):
        pred_params['response_cols'] = response_col
        if conc_col is not None and conc_col in input_df.columns.values:
            pred_params['response_cols'] += "," + conc_col
    elif conc_col is not None and conc_col in input_df.columns.values:
        pred_params['response_cols'] = "ACTIVITY," + conc_col

    return input_df, pred_params
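
A hedged sketch of calling this helper; the input file name is a placeholder and the column arguments are illustrative, not taken from the source.

import pandas as pd

input_df = pd.read_csv('compounds.csv', index_col=False)  # placeholder file
prepared_df, pred_params = _prepare_input_data(input_df,
                                               id_col='compound_id',
                                               smiles_col='smiles',
                                               response_col=None,
                                               conc_col=None,
                                               dont_standardize=False)
# pred_params now carries the featurizer choice, a temporary result_dir,
# and the id/smiles column names expected by the prediction pipeline.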
Example #4
def mcs_vs_tanimoto(pred_dset, pred_smiles_col='smiles'):
    """
    Compute within-dataset distance matrices for compounds in pred_dset based on both Tanimoto and MCS
    distances, and compare the resulting distances in a scatterplot. Returns a data frame of pairwise distances.
    """
    if isinstance(pred_dset, str):
        pred_df = pd.read_csv(pred_dset, index_col=False)
    else:
        pred_df = pred_dset
    pred_smiles = pred_df[pred_smiles_col].values
    pred_smiles = [base_smiles_from_smiles(s) for s in pred_smiles]
    cmpd_ids = pred_df.compound_id.values
    ncmpd = pred_df.shape[0]

    cmpd_i_list = []
    cmpd_j_list = []
    tani_dist = []
    mcs_dist = []

    tani_dist_mat = cd.calc_dist_smiles('ecfp',
                                        'tanimoto',
                                        pred_smiles,
                                        calc_type='all')
    mcs_dist_mat = cd.calc_dist_smiles('ecfp',
                                       'mcs',
                                       pred_smiles,
                                       calc_type='all')
    for i in range(ncmpd - 1):
        cmpd_i = cmpd_ids[i]
        for j in range(i + 1, ncmpd):
            cmpd_j = cmpd_ids[j]
            cmpd_i_list.append(cmpd_i)
            cmpd_j_list.append(cmpd_j)
            tani_dist.append(tani_dist_mat[i, j])
            mcs_dist.append(mcs_dist_mat[i, j])

    dist_df = pd.DataFrame(
        dict(compound_i=cmpd_i_list,
             compound_j=cmpd_j_list,
             tanimoto_distance=tani_dist,
             mcs_distance=mcs_dist))
    fig, ax = plt.subplots(figsize=(15, 15))
    sns.scatterplot(x='mcs_distance', y='tanimoto_distance', data=dist_df,
                    ax=ax)

    return dist_df
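
A possible invocation, assuming a CSV of predictions with 'compound_id' and 'smiles' columns (both are required by the code above); the file name is a placeholder.

dist_df = mcs_vs_tanimoto('predictions.csv', pred_smiles_col='smiles')
print(dist_df.sort_values(by='tanimoto_distance').head())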
Example #5
def nearest_neighbor_distances(pred_dset,
                               ref_dset,
                               pred_smiles_col='smiles',
                               ref_smiles_col='base_rdkit_smiles'):
    """
    Find the nearest neighbor compound in the reference dataset for each predicted compound and its distance to
    the predicted compound. Add this information to the table of predicted properties.
    """

    if isinstance(pred_dset, str):
        pred_df = pd.read_csv(pred_dset, index_col=False)
    else:
        pred_df = pred_dset
    if isinstance(ref_dset, str):
        ref_df = pd.read_csv(ref_dset, index_col=False)
    else:
        ref_df = ref_dset
    pred_smiles = pred_df[pred_smiles_col].values
    pred_smiles = [base_smiles_from_smiles(s) for s in pred_smiles]
    ref_smiles = ref_df[ref_smiles_col].values

    dist_arr = cd.calc_dist_smiles('ecfp',
                                   'tanimoto',
                                   pred_smiles,
                                   ref_smiles,
                                   calc_type='all')
    ref_cmpd_ids = ref_df.compound_id.values

    nn_ind = np.argmin(dist_arr, axis=1)
    nn_dist = np.min(dist_arr, axis=1)
    pred_df['nearest_cmpd'] = ref_cmpd_ids[nn_ind]
    pred_df['nearest_dist'] = nn_dist
    uniq_neighbors, counts = np.unique(pred_df.nearest_cmpd.values,
                                       return_counts=True)
    nnfreq_df = pd.DataFrame(
        dict(nearest_cmpd=uniq_neighbors,
             pred_cmpd_count=counts)).sort_values(by='pred_cmpd_count',
                                                  ascending=False)
    nn_pred_df = pred_df.merge(nnfreq_df, how='left',
                               on='nearest_cmpd').sort_values(
                                   by=['pred_cmpd_count', 'nearest_cmpd'],
                                   ascending=False)
    return nn_pred_df
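
A usage sketch under the same assumptions: the prediction file has 'compound_id' and 'smiles' columns, the reference (e.g., training) file has 'compound_id' and 'base_rdkit_smiles' columns, and both file names are placeholders.

nn_df = nearest_neighbor_distances('predictions.csv', 'train_set.csv')
# Compounds far from every reference compound are the least reliable predictions
print(nn_df[['nearest_cmpd', 'nearest_dist', 'pred_cmpd_count']].head())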
Example #6
def compute_dist_matrix(pred_file,
                        ref_dset_file,
                        pred_smiles_col='smiles',
                        ref_smiles_col='base_rdkit_smiles'):
    """
    Compute the Tanimoto distance matrix between the SMILES strings in pred_file and those in ref_dset_file.
    """

    pred_df = pd.read_csv(pred_file, index_col=False)
    ref_df = pd.read_csv(ref_dset_file, index_col=False)
    pred_smiles = pred_df[pred_smiles_col].values
    pred_smiles = [base_smiles_from_smiles(s) for s in pred_smiles]
    ref_smiles = ref_df[ref_smiles_col].values

    dist_arr = cd.calc_dist_smiles('ecfp',
                                   'tanimoto',
                                   pred_smiles,
                                   ref_smiles,
                                   calc_type='all')
    return dist_arr
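
A sketch of the call; the return value is an n_pred x n_ref NumPy array, so the row-wise minimum gives each predicted compound's nearest-neighbor distance. File names are placeholders.

dist_arr = compute_dist_matrix('predictions.csv', 'train_set.csv')
print(dist_arr.shape)
print(dist_arr.min(axis=1)[:5])  # nearest-neighbor distances for the first five compounds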
Example #7
def aggregate_assay_data(assay_df,
                         value_col='VALUE_NUM',
                         output_value_col=None,
                         label_actives=True,
                         active_thresh=None,
                         id_col='CMPD_NUMBER',
                         smiles_col='rdkit_smiles',
                         relation_col='VALUE_FLAG',
                         date_col=None):
    """
    Map RDKit SMILES strings in assay_df to base structures, then compute an MLE estimate of the mean value over replicate measurements
    for the same SMILES strings, taking censoring into account. Generate an aggregated result table with one value for each unique base 
    SMILES string, to be used in an ML-ready dataset.

    :param assay_df: The input data frame to be processed.
    :param value_col: The column in the data frame containing assay values to be averaged.
    :param output_value_col: Optional; the column name to use in the output data frame for the averaged data.
    :param label_actives: If True, generate an additional column 'active' indicating whether the mean value is above a threshold specified by active_thresh.
    :param active_thresh: The threshold to be used for labeling compounds as active or inactive.
    If active_thresh is None (the default), the threshold used is the minimum reported value across all records
    with left-censored values (i.e., those with '<' in the relation column).
    :param id_col: The input data frame column containing compound IDs.
    :param smiles_col: The input data frame column containing SMILES strings.
    :param relation_col: The input data frame column containing relational operators (<, >, etc.).
    :param date_col: The input data frame column containing dates when the assay data was uploaded. If not None, the code will assign the earliest
    date among replicates to the aggregate data record.
    :return: A data frame containing averaged assay values, with one value per compound.
    """

    assay_df = assay_df.fillna({relation_col: '', smiles_col: ''})
    # Filter out rows where SMILES is missing
    n_missing_smiles = np.array(
        [len(smiles) == 0 for smiles in assay_df[smiles_col].values]).sum()
    print("%d entries in input table are missing SMILES strings" %
          n_missing_smiles)
    has_smiles = np.array(
        [len(smiles) > 0 for smiles in assay_df[smiles_col].values])
    assay_df = assay_df[has_smiles].copy()

    # Estimate the measurement error across replicates for this assay
    std_est = replicate_rmsd(assay_df,
                             smiles_col=smiles_col,
                             value_col=value_col,
                             relation_col=relation_col)

    # Map SMILES strings to base structure SMILES strings, then map these to indices into the list of
    # unique base structures
    orig_smiles_strs = assay_df[smiles_col].values
    norig = len(set(orig_smiles_strs))
    smiles_strs = [
        base_smiles_from_smiles(smiles, True) for smiles in orig_smiles_strs
    ]
    assay_df['base_rdkit_smiles'] = smiles_strs
    uniq_smiles_strs = list(set(smiles_strs))
    nuniq = len(uniq_smiles_strs)
    print(
        "%d unique SMILES strings are reduced to %d unique base SMILES strings"
        % (norig, nuniq))
    smiles_map = dict([(smiles, i)
                       for i, smiles in enumerate(uniq_smiles_strs)])
    smiles_indices = np.array(
        [smiles_map.get(smiles, nuniq) for smiles in smiles_strs])

    assay_vals = assay_df[value_col].values
    value_flags = assay_df[relation_col].values

    # Compute a maximum likelihood estimate of the mean assay value for each compound, averaging over replicates
    # and factoring in censoring. Report the censoring/relation/value_flag only if the flags are consistent across
    # all replicates. Exclude compounds that couldn't be mapped to SMILES strings.

    cmpd_ids = assay_df[id_col].values
    reported_cmpd_ids = [''] * nuniq
    reported_value_flags = [''] * nuniq
    if date_col is not None:
        reported_dates = [''] * nuniq
    reported_assay_val = np.zeros(nuniq, dtype=float)
    for i in range(nuniq):
        cmpd_ind = np.where(smiles_indices == i)[0]
        cmpd_df = assay_df.iloc[cmpd_ind]
        reported_assay_val[i], reported_value_flags[i] = mle_censored_mean(
            cmpd_df, std_est, value_col=value_col, relation_col=relation_col)
        # When multiple compound IDs map to the same base SMILES string, use the lexicographically smallest one.
        reported_cmpd_ids[i] = sorted(set(cmpd_ids[cmpd_ind]))[0]

        # If a date column is specified, use the earliest one among replicates
        if date_col is not None:
            # np.datetime64 doesn't seem to understand the date format in GSK's crit res tables
            #earliest_date = sorted([np.datetime64(d) for d in cmpd_df[date_col].values])[0]
            earliest_date = sorted(
                pd.to_datetime(cmpd_df[date_col],
                               infer_datetime_format=True).values)[0]
            reported_dates[i] = np.datetime_as_string(earliest_date)

    if output_value_col is None:
        output_value_col = value_col
    agg_df = pd.DataFrame({
        'compound_id': reported_cmpd_ids,
        'base_rdkit_smiles': uniq_smiles_strs,
        'relation': reported_value_flags,
        output_value_col: reported_assay_val
    })

    if date_col is not None:
        agg_df[date_col] = reported_dates

    # Label each compound as active or not, based on the reported relation and values relative to a common threshold
    if label_actives:
        inactive_df = agg_df[agg_df.relation == '<']
        if inactive_df.shape[0] > 0 and active_thresh is None:
            active_thresh = np.min(inactive_df[output_value_col].values)
        if active_thresh is not None:
            is_active = ((agg_df.relation != '<') &
                         (agg_df[output_value_col].values > active_thresh))
            agg_df['active'] = [int(a) for a in is_active]
        else:
            agg_df['active'] = 1

    return agg_df
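
A hedged usage sketch; the input file and the output column name are hypothetical, and the remaining column names are simply the function's defaults.

import pandas as pd

assay_df = pd.read_csv('raw_assay_data.csv')  # placeholder file
agg_df = aggregate_assay_data(assay_df,
                              value_col='VALUE_NUM',
                              output_value_col='mean_value',  # hypothetical name
                              id_col='CMPD_NUMBER',
                              smiles_col='rdkit_smiles',
                              relation_col='VALUE_FLAG')
# agg_df has one row per unique base structure, with 'relation' and 'active' columns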
Example #8
def predict_activity(args):
    """Run activity predictions on the compounds in args.input_file, write them
    to args.output_file, and print performance metrics if measured activity
    values are provided via args.activity_col.
    """

    input_df = pd.read_csv(args.input_file, index_col=False)
    colnames = set(input_df.columns.values)
    if args.id_col not in colnames:
        input_df['compound_id'] = [
            'compound_%.6d' % i for i in range(input_df.shape[0])
        ]
        args.id_col = 'compound_id'
    if args.smiles_col not in colnames:
        raise ValueError(
            'smiles_col parameter not specified or column not in input file.')
    if args.dont_standardize:
        std_smiles_col = args.smiles_col
    else:
        print("Standardizing SMILES strings for %d compounds." %
              input_df.shape[0])
        orig_ncmpds = input_df.shape[0]
        std_smiles = [
            base_smiles_from_smiles(s)
            for s in input_df[args.smiles_col].values
        ]
        input_df['standardized_smiles'] = std_smiles
        input_df = input_df[input_df.standardized_smiles != '']
        if input_df.shape[0] == 0:
            print("No valid SMILES strings to predict on.")
            return
        nlost = orig_ncmpds - input_df.shape[0]
        input_df = input_df.sort_values(by=args.id_col)
        orig_smiles = input_df[args.smiles_col].values
        if nlost > 0:
            print(
                "Could not parse %d SMILES strings; will predict on the remainder."
                % nlost)
        std_smiles_col = 'standardized_smiles'

    pred_params = {'id_col': args.id_col, 'smiles_col': std_smiles_col}
    has_activity = (args.activity_col is not None)
    if has_activity:
        pred_params['response_cols'] = args.activity_col
    pred_params = parse.wrapper(pred_params)

    model_files = dict(random='bsep_classif_random_split.tar.gz',
                       scaffold='bsep_classif_scaffold_split.tar.gz')
    if args.model_type not in model_files:
        raise ValueError("model_type %s is not a recognizied value." %
                         args.model_type)

    # Load the model from the tarball and run predictions
    models_dir = os.path.join(os.path.dirname(os.path.dirname(mp.__file__)),
                              'examples', 'BSEP', 'models')
    model_tarfile = os.path.join(models_dir, model_files[args.model_type])
    pipe = mp.create_prediction_pipeline_from_file(pred_params,
                                                   reload_dir=None,
                                                   model_path=model_tarfile)
    pred_df = pipe.predict_full_dataset(input_df,
                                        contains_responses=has_activity,
                                        dset_params=pred_params)
    pred_df = pred_df.sort_values(by=args.id_col)
    if not args.dont_standardize:
        pred_df[args.smiles_col] = orig_smiles

    # Write predictions to output file
    pred_df.to_csv(args.output_file, index=False)
    print("Wrote predictions to file %s" % args.output_file)

    # If measured activity values are provided, print some performance metrics
    if has_activity:
        actual_vals = pred_df['%s_actual' % args.activity_col].values
        pred_classes = pred_df['%s_pred' % args.activity_col].values
        pred_probs = pred_df['%s_prob' % args.activity_col].values
        conf_matrix = metrics.confusion_matrix(actual_vals, pred_classes)
        roc_auc = metrics.roc_auc_score(actual_vals, pred_probs)
        prc_auc = metrics.average_precision_score(actual_vals, pred_probs)
        accuracy = metrics.accuracy_score(actual_vals, pred_classes)
        precision = metrics.precision_score(actual_vals, pred_classes)
        npv = negative_predictive_value(actual_vals, pred_classes)
        recall = metrics.recall_score(actual_vals, pred_classes)
        mcc = metrics.matthews_corrcoef(actual_vals, pred_classes)
        ncorrect = sum(actual_vals == pred_classes)
        print("Performance metrics:\n")
        print("%d out of %d predictions correct." %
              (ncorrect, pred_df.shape[0]))
        print("Accuracy: %.3f" % accuracy)
        print("Precision: %.3f" % precision)
        print("Recall: %.3f" % recall)
        print("NPV: %.3f" % npv)
        print("ROC AUC: %.3f" % roc_auc)
        print("PRC AUC: %.3f" % prc_auc)
        print("Matthews correlation coefficient: %.3f" % mcc)
        print("Confusion matrix:")
        print("\t\tpredicted activity")
        print("actual\nactivity\t0\t1\n")
        print("   0\t\t%d\t%d" % (conf_matrix[0][0], conf_matrix[0][1]))
        print("   1\t\t%d\t%d" % (conf_matrix[1][0], conf_matrix[1][1]))