Python PandasTools示例，rdkit.Chem.PandasTools Python示例

示例#1

0

显示文件

 def test_all_numeric_with_numeric_columns(self):
     """With ``allNumeric=True`` the numeric ``len`` column is written for both rows."""
     frame = self.df
     # Add an integer column derived from the length of each ID string.
     frame["len"] = frame["ID"].map(len)

     buffer = StringIO()
     PandasTools.WriteSDF(frame, buffer, allNumeric=True)
     written = buffer.getvalue()

     self.assertEqual(written.count("<len>"), 2)
     for expected_value in ("7\n\n", "8\n\n"):
         self.assertIn(expected_value, written)

示例#2

0

显示文件

文件： ace_workflow.py 项目： r-cloke/deepchem_ace

def make_input():
    """Build ``dude_ace.csv`` from the active and decoy SMILES files.

    Reads ``actives_final.ism`` and ``decoys_final.ism`` (space separated, no
    header row), labels every row, adds an RDKit molecule column (``MOL``) to
    both frames, and writes the ``SMILES``, ``ID`` and ``is_active`` columns of
    the stacked table to ``dude_ace.csv``.
    """
    active_df = pd.read_csv("actives_final.ism", header=None, sep=" ")
    active_df.columns = ["SMILES", "ID", "ChEMBL_ID"]
    active_df["label"] = "Active"
    PandasTools.AddMoleculeColumnToFrame(active_df, "SMILES", "MOL")

    decoy_df = pd.read_csv("decoys_final.ism", header=None, sep=" ")
    decoy_df.columns = ["SMILES", "ID"]
    decoy_df["label"] = "Decoy"
    PandasTools.AddMoleculeColumnToFrame(decoy_df, "SMILES", "MOL")

    active_df["is_active"] = 1
    decoy_df["is_active"] = 0

    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
    # same way (original row labels are kept, as append did by default).
    combined_df = pd.concat([active_df, decoy_df])[["SMILES", "ID", "is_active"]]

    combined_df.to_csv("dude_ace.csv", index=False)

示例#3

0

显示文件

def get_mols_from_files(filenames, targets, verbose=True):
    """
    Read each file into its own Pandas dataframe and collect its molecules.

    The file type is chosen from the file extension. Supported extensions are
    .sdf, .smi, .csv, and .tsv. For each file the molecules, stats, and the
    molecules that require review are extracted via ``get_activities``.

    :param filenames: iterable of paths to read.
    :param targets: activity field names passed to ``get_activities`` and used
        for the per-target hit count that is logged for every file.
    :param verbose: accepted for interface compatibility; not used here.
    :return: tuple ``(all_mols, all_for_review)`` - the list of molecules from
        all files and the merged dict of molecules requiring review.
    :raises ValueError: if a filename has an unsupported extension.
    """

    all_for_review = {}
    all_mols = []

    for filename in filenames:
        logging.info(filename)

        # Determine the type of the file from its extension.
        file_ext = pathlib.Path(filename).suffix
        # TODO: mol_field could become a parameter defaulting to "mol".
        mol_field = "mol"

        if file_ext == ".sdf":
            df = PandasTools.LoadSDF(filename, molColName=mol_field)
        elif file_ext in (".csv", ".tsv", ".smi"):
            sep = "\t" if file_ext == ".tsv" else ","
            if file_ext == ".smi":
                mol_field = "smiles"
            df = pandas.read_csv(filename, sep=sep)
        else:
            # Previously this branch fell through, leaving `df` unbound for the
            # first file or silently re-processing the previous file's frame.
            raise ValueError(
                f"Unsupported file extension {file_ext!r} for {filename!r}; "
                "expected one of .sdf, .smi, .csv, .tsv")

        # The stats value returned by get_activities is not used here.
        mols, _stats, for_review = get_activities(df,
                                                  original_filename=filename,
                                                  activity_fields=targets,
                                                  mol_field=mol_field)

        # Report the number of mols with activity for each target.
        for target in targets:
            hits = sum(1 for mol in mols if mol.has_activity(target))
            logging.info("%s %s hits: %d", filename, target, hits)

        # Accumulate this file's molecules and its review candidates.
        all_mols.extend(mols)
        extend_dict(all_for_review, for_review)

    return all_mols, all_for_review

示例#4

0

显示文件

 def test_all_numeric_with_no_numeric_columns(self):
     """With ``allNumeric=True`` and no numeric columns, no property tags are written."""
     buffer = StringIO()
     PandasTools.WriteSDF(self.df, buffer, allNumeric=True)
     written = buffer.getvalue()

     self.assertFalse(">" in written, written)
     # Double-check that the numeric tests don't pass by accident.
     for unexpected_value in ("7\n\n", "8\n\n"):
         self.assertNotIn(unexpected_value, written)

示例#5

0

显示文件

 def test_specify_numeric_column(self):
     """A numeric column named explicitly in ``properties`` is written for both rows."""
     frame = self.df
     # Add an integer column derived from the length of each ID string.
     frame["len2"] = frame["ID"].map(len)

     buffer = StringIO()
     PandasTools.WriteSDF(frame, buffer, properties=["len2"])
     written = buffer.getvalue()

     self.assertEqual(written.count("<len2>"), 2)
     for expected_value in ("7\n\n", "8\n\n"):
         self.assertIn(expected_value, written)

示例#6

0

显示文件

文件： cal_dERMSD.py 项目： zhenglz/deltaVinaXGB

def num_structure_change(confs, native):
    """Count conformations satisfying the energy requirements (for entropy).

    :param confs: SDF input passed to ``PandasTools.LoadSDF``; every record
        must carry an ``energy_abs`` property convertible to float.
    :param native: energy threshold compared against ``energy_abs``.
    :return: tuple ``(num_1, num_2)`` where ``num_1`` is the number of
        conformations with energy below ``lowest + 1.0`` (``lowest`` being the
        minimum ``energy_abs``) and ``num_2`` the number below ``native``.
    """
    df_confs = PandasTools.LoadSDF(confs)
    energies = df_confs["energy_abs"].astype(float)

    # The minimum does not need the frame to be sorted first.
    lowest = energies.min()

    # Summing the boolean masks counts matching rows without building
    # intermediate filtered frames.
    num_1 = int((energies < lowest + 1.0).sum())
    num_2 = int((energies < native).sum())

    return num_1, num_2

示例#7

0

显示文件

def load_valid_atom_or_bond_features(path: str,
                                     smiles: List[str]) -> List[np.ndarray]:
    """
    Loads features saved in a variety of formats.

    Supported formats:

    * :code:`.npz` descriptors are saved as 2D array for each molecule in the order of that in the data.csv
    * :code:`.pkl` / :code:`.pckl` / :code:`.pickle` containing a pandas dataframe with smiles as index and numpy array of descriptors as columns
    * :code:`.sdf` containing all mol blocks with descriptors as entries

    :param path: Path to file containing atomwise features.
    :param smiles: SMILES strings used to select and order the rows of an
        :code:`.sdf` input; not used for the other formats.
    :return: A list of 2D array.
    :raises ValueError: If the extension is unsupported, the pickled arrays are
        neither 1D nor 2D, or an :code:`.sdf` lacks data for a requested SMILES.
    """

    extension = os.path.splitext(path)[1]

    if extension == '.npz':
        # NpzFile keeps the archive open until closed; read every array inside
        # a context manager so the file handle is released.
        with np.load(path) as container:
            features = [container[key] for key in container]

    elif extension in ['.pkl', '.pckl', '.pickle']:
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        features_df = pd.read_pickle(path)
        if features_df.iloc[0, 0].ndim == 1:
            features = features_df.apply(
                lambda x: np.stack(x.tolist(), axis=1), axis=1).tolist()
        elif features_df.iloc[0, 0].ndim == 2:
            features = features_df.apply(
                lambda x: np.concatenate(x.tolist(), axis=1), axis=1).tolist()
        else:
            raise ValueError(
                f'Atom/bond descriptors input {path} format not supported')

    elif extension == '.sdf':
        features_df = PandasTools.LoadSDF(path).drop(
            ['ID', 'ROMol'], axis=1).set_index('SMILES')

        # Keep only the first record for each SMILES.
        features_df = features_df[~features_df.index.duplicated()]

        # locate atomic descriptors columns: those whose first-row value is a
        # comma separated string
        features_df = features_df.iloc[:, features_df.iloc[
            0, :].apply(lambda x: isinstance(x, str) and ',' in x).to_list()]
        # Select/order rows by the requested SMILES; missing ones become NaN.
        features_df = features_df.reindex(smiles)
        if features_df.isnull().any().any():
            raise ValueError(
                'Invalid custom atomic descriptors file, Nan found in data')

        # Parse each comma separated string into a float array.
        features_df = features_df.applymap(lambda x: np.array(
            x.replace('\r', '').replace('\n', '').split(',')).astype(float))

        features = features_df.apply(lambda x: np.stack(x.tolist(), axis=1),
                                     axis=1).tolist()

    else:
        raise ValueError(f'Extension "{extension}" is not supported.')

    return features

示例#8

0

显示文件

def get_chembl(terms_to_keep):
    """Load the ChEMBL 21 SDF into a DataFrame tagged with its source.

    Every name in *terms_to_keep* that is not already a column after loading
    is added as a column of empty strings.
    """
    sdf_file = '/project/projectdirs/openmsi/projects/compound_data/chembl/chembl_21.sdf.gz'
    df = PandasTools.LoadSDF(sdf_file)
    df['source_database'] = 'chembl'

    # Snapshot of the column names before any filler columns are added.
    existing = list(df.keys())
    missing_terms = [term for term in terms_to_keep if term not in existing]
    for term in missing_terms:
        df[term] = ''
    return df

示例#9

0

显示文件

def main():
    """Convert a CSV of SMILES into an SDF of hydrogen-added, embedded molecules.

    Reads ``args.infile`` (columns ``rdkit_smiles`` and ``compound_id``),
    builds an RDKit molecule per row, adds hydrogens, embeds coordinates,
    records the formal charge as ``i_user_TOTAL_CHARGE`` and writes everything
    to ``args.outfile``.
    """
    args = getArgs()
    print(args.infile, args.outfile)
    smiles_df = pd.read_csv(args.infile)
    # Copy so the columns added below go to an independent frame rather than
    # to a slice of smiles_df (avoids SettingWithCopy issues).
    pp = smiles_df[['rdkit_smiles', 'compound_id']].copy()

    PandasTools.AddMoleculeColumnToFrame(pp, 'rdkit_smiles', 'Molecule')

    for index in pp.index:
        mol = Chem.AddHs(pp.at[index, 'Molecule'])
        AllChem.EmbedMolecule(mol)
        # Store the processed molecule back in the frame. Assigning to the row
        # yielded by iterrows() only changed a temporary copy, so the written
        # SDF used to contain the original molecules without hydrogens.
        pp.at[index, 'Molecule'] = mol
        pp.at[index, 'i_user_TOTAL_CHARGE'] = Chem.rdmolops.GetFormalCharge(
            mol)

    PandasTools.WriteSDF(pp,
                         args.outfile,
                         molColName='Molecule',
                         idName='compound_id',
                         properties=list(pp.columns))

示例#10

0

显示文件

文件： rdkit_easy.py 项目： CBIIT/NCI-DOE-Collab-ATOM-Modeling-Pipeline-AMPL

def add_mol_column(df, smiles_col, molecule_col='mol'):
    """
    Attach RDKit Mol objects to 'df' and hand the same frame back.

    The molecules are built from the SMILES strings in column 'smiles_col'
    and stored in column 'molecule_col'; fingerprints are requested as well.
    """
    PandasTools.AddMoleculeColumnToFrame(
        df,
        smilesCol=smiles_col,
        molCol=molecule_col,
        includeFingerprints=True,
    )
    return df

示例#11

0

显示文件

def test_force_Kekulize():
    """Display the molecules of ``All_Moles_Tested_Data.csv`` with flagged atoms recoloured.

    For every SMILES the atom indices returned by ``find_custom_Kekulize_set``
    are changed to atomic number 32 so they stand out, then the molecules are
    displayed as grid images of five per row.
    """
    df = pd.read_csv('All_Moles_Tested_Data.csv')

    marked_mols = []
    for smile in df['smiles']:
        mol = MolFromSmiles(smile)
        flagged = find_custom_Kekulize_set(smile, max_atoms=60, max_degree=5, printMe=False)
        for atom_idx in flagged:
            mol.GetAtomWithIdx(atom_idx).SetAtomicNum(32)
        marked_mols.append(mol)

    df['mol'] = pd.DataFrame({'mol': marked_mols})

    unit = 5
    # Stepping by `unit` yields every full row first and then the shorter
    # trailing row, if any; iloc clips the slice at the end of the frame.
    for start in range(0, len(df), unit):
        chunk = df.iloc[start:start + unit]
        display(PandasTools.FrameToGridImage(chunk, column='mol', legendsCol='', molsPerRow=unit))

示例#12

0

显示文件

文件： pipeline.py 项目： leelasdSI/rdkit_ipynb_tools

def stop_df_from_stream(stream, summary=None, comp_id="stop_df_from_stream"):
    """Collect the data stream into a Pandas DataFrame.

    The molecules need to be present in the stream,
    e.g. generated by `pipe_mol_from_smiles`.

    Raises ImportError when the module-level PANDAS flag is falsy."""

    if not PANDAS:
        raise ImportError("pandas is not available.")

    PT.RenderImagesInAllDataFrames(images=True)
    stream_dict = stop_dict_from_stream(stream, summary=summary, comp_id=comp_id)
    return pandas.DataFrame.from_dict(stream_dict)

示例#13

0

显示文件

def main():
    """Rename and export the top-ranked docked molecules from an SDF.

    Reads ``args.infile``, keeps the first ``args.top`` records (all records
    when ``args.top`` is not given), sets each molecule's ``_Name`` to
    ``<name>::<rank>::<score>::<dock>`` and writes a gzipped SDF plus a
    bzip2-compressed, tab separated name/score table using ``args.outpref``.
    """
    name = args_name = None  # placeholders replaced right below
    args = UserInput()
    name = args.name or 'ID'
    score = args.score or 'Chemgauss4'
    dock = args.dock or 'fred'
    # None means "no limit". The previous sentinel of -1 made the slice below
    # silently drop the last molecule.
    top = int(args.top) if args.top else None

    df = rdpd.LoadSDF(args.infile,
                      removeHs=False,
                      molColName='ROMol',
                      idName='mol_ID')[:top].fillna('')
    print('\033[34m> select mol: \033[32m{0}\033[0m'.format(len(df)))
    df[score] = df[score].apply(float)
    df['Rank'] = df.index

    for idx, row in df.iterrows():
        # Rank is taken from the frame index, shifted to start at 1.
        df['ROMol'][idx].SetProp(
            '_Name',
            '{0}::{1}::{2:.2f}::{3}'.format(row[name], row['Rank'] + 1,
                                            row[score], dock))

    sdf_out = '{0}.{1}_docked.sdf.gz'.format(args.outpref, dock)
    csv_out = '{0}.{1}_docked.txt.bz2'.format(args.outpref, dock)

    rdpd.WriteSDF(df, sdf_out, properties=list(df.columns))
    df.to_csv(csv_out,
              header=False,
              index=False,
              sep='\t',
              columns=[name, score],
              float_format='%.3f')

示例#14

0

显示文件

文件： QueryHandler.py 项目： SohanCSERU/PhytoChem

def update_sdf():
    """Export every ``Compound`` record to ``media/all_data.sdf``.

    The bookkeeping columns ``id``, ``created_at`` and ``updated_at`` are
    dropped, an ``ROMol`` column is built from ``Smiles``, and the ``media``
    directory is created when it does not exist yet.
    """
    records = list(Compound.objects.all().values())
    compounds_df = pd.DataFrame(records).drop(
        columns=['id', 'created_at', 'updated_at'])

    PandasTools.AddMoleculeColumnToFrame(
        compounds_df,
        smilesCol='Smiles',
        molCol='ROMol',
        includeFingerprints=True,
    )

    media_dir = 'media'
    if not os.path.exists(media_dir):
        os.makedirs(media_dir)
    df_to_sdf(compounds_df, 'media/all_data.sdf')

示例#15

0

显示文件

    def add_mol_to_frame(self):
        """
        Adds an ``ROMol`` molecule column, built from the ``smiles`` column, to
        the :py:class:`pandas.DataFrame` held in ``self._data``.

        :return: None
        """
        # Fingerprints are explicitly not requested for the new column.
        PandasTools.AddMoleculeColumnToFrame(
            self._data, smilesCol="smiles", molCol="ROMol", includeFingerprints=False
        )
        # NOTE(review): the Series returned by apply() is discarded, so this
        # statement does not change self._data. It indexes every ROMol value
        # with [0]; whether that is valid for the stored objects cannot be
        # determined from this block - confirm the intent or remove the line.
        self._data["ROMol"].apply(lambda x: x[0])

示例#16

0

显示文件

文件： UnitTestPandasTools.py 项目： sb123456789sb/rdkit

 def test_write_to_sdf_gz(self):
     """WriteSDF to a ``.sdf.gz`` path produces a gzip file with both records."""
     dirname = tempfile.mkdtemp()
     try:
         filename = os.path.join(dirname, "test.sdf.gz")
         PandasTools.WriteSDF(self.df, filename)
         # Open in text mode: on Python 3 the default binary mode returns
         # bytes, which cannot be searched/split with the str arguments used
         # below. The context manager also closes the handle before cleanup.
         with gzip.open(filename, "rt") as handle:
             s = handle.read()
         self.assertEqual(s.count("\n$$$$\n"), 2)
         self.assertEqual(s.split("\n", 1)[0], "Methane")
     finally:
         shutil.rmtree(dirname)

示例#17

0

显示文件

    def test_FrameToGridImage(self):
        """Smoke test: FrameToGridImage must not raise for three legend settings.

        Set ``interactive`` to True to look at the generated images.
        """
        interactive = False
        self.assertTrue(True)
        df = self.df

        def render(**grid_kwargs):
            # Build one grid image and optionally show it.
            image = PandasTools.FrameToGridImage(df, **grid_kwargs)
            if interactive:
                image.show()
            return image

        # Default legends.
        result = render()
        # Legends taken from a named column.
        result = render(legendsCol='PUBCHEM_IUPAC_INCHIKEY')
        # Legends taken from the index name.
        result = render(legendsCol=df.index.name)

示例#18

0

显示文件

文件： sdf_NeverSeeFilter_separator.py 项目： Tmacme/Structure-Based_docking

def _write_filter_subset(subset, out_file):
    """Write *subset* to *out_file* as SMILES if the name contains ``.smi``, else as SDF."""
    # The dot is escaped so that only a literal ".smi" selects SMILES output;
    # the previous pattern '.smi' matched any character followed by "smi".
    if re.search(r'\.smi', out_file, re.IGNORECASE):
        # Work on a copy (the caller passes a filtered view) and create the
        # column by item assignment: attribute assignment such as
        # `df.smiles = ...` does not add a new column to a DataFrame.
        subset = subset.copy()
        subset['smiles'] = subset.MOL.apply(
            lambda m: Chem.MolToSmiles(Chem.RemoveHs(m)))
        subset.to_csv(out_file,
                      columns=['smiles', 'ID'],
                      sep=' ',
                      header=False,
                      index=False)
    else:
        rdpd.WriteSDF(subset,
                      out_file,
                      molColName='MOL',
                      properties=list(subset.columns))


def main():
    """Split the input molecules by their ``NeverSee_Groups`` flag and write both sets.

    Rows flagged ``'Y'`` go to ``args.nsee_file`` and rows flagged ``'N'`` to
    ``args.pass_file``; the size of each set is printed first.
    """
    args = UserInput()

    df = RDkitRead(args.infile, args.id, removeHs=False, add_Hs=False)

    nsee_df = df[df['NeverSee_Groups'] == 'Y']
    pass_df = df[df['NeverSee_Groups'] == 'N']

    print('\033[34m Passed NeverSee Filter: \033[32m{0}\033[0m'.format(
        len(pass_df)))
    print('\033[34m Failed NeverSee Filter: \033[31m{0}\033[0m'.format(
        len(nsee_df)))

    _write_filter_subset(nsee_df, args.nsee_file)
    _write_filter_subset(pass_df, args.pass_file)
    print('')

示例#19

0

显示文件

文件： cluster.py 项目： lenselinkbart/APCluster

def parse_sd_file(file, tgz=False):
    """
    parse a sd file and return molecules

    :param file: path of the SD file to read.
    :param tgz: when true, *file* is opened with gzip before parsing.
    :return: DataFrame from ``PandasTools.LoadSDF`` with the molecules in
        column ``Molecule`` and their SMILES in column ``smiles``.
    """
    if not tgz:
        return PandasTools.LoadSDF(file,
                                   molColName='Molecule',
                                   smilesName='smiles')

    # Close the gzip handle once parsing is done instead of leaking it.
    # NOTE(review): relies on LoadSDF reading the whole stream before it
    # returns - confirm against the RDKit version in use.
    with gzip.open(file) as handle:
        return PandasTools.LoadSDF(handle,
                                   molColName='Molecule',
                                   smilesName='smiles')

示例#20

0

显示文件

    def compute_unique_smiles(self,
                              interp_df,
                              embeddings,
                              embedding_funct,
                              scaled_radius=0.5):
        """
        Identify duplicate SMILES and distorts the embedding. The input df
        must have columns 'SMILES' and 'Generated' at 0th and 1st position.
        'Generated' column must contain boolean to classify SMILES into input
        SMILES(False) and generated SMILES(True).

        This function does not make any assumptions about order of embeddings.
        Instead it simply orders the df by SMILES to identify the duplicates.

        Both phases below retry at most five times. ``embeddings`` is modified
        in place; the returned frame is ``interp_df`` without the temporary
        'ROMol' column.
        """
        # Jitter magnitude derived from the scaled radius.
        distance = self._compute_radius(scaled_radius)

        # Phase 1: jitter the embeddings of duplicated generated SMILES.
        for i in range(5):
            # After sorting, equal SMILES are adjacent, so comparing each entry
            # with its successor finds every duplicate.
            smiles = interp_df['SMILES'].sort_values()
            duplicates = set()
            for idx in range(0, smiles.shape[0] - 1):
                if smiles.iat[idx] == smiles.iat[idx + 1]:
                    duplicates.add(smiles.index[idx])
                    duplicates.add(smiles.index[idx + 1])

            if len(duplicates) > 0:
                for dup_idx in duplicates:
                    # NOTE(review): dup_idx is an index label but .iat is
                    # positional; this presumably assumes a default RangeIndex
                    # on interp_df - confirm.
                    if interp_df.iat[dup_idx, 1]:
                        # add jitter to generated molecules only
                        embeddings[dup_idx] = self.addjitter(
                            embeddings[dup_idx], distance, 1)
                # NOTE(review): the regenerated value is bound to the local
                # `smiles`, which is overwritten from interp_df at the top of
                # the next iteration; interp_df['SMILES'] is not updated in
                # this block. Verify whether the result should be written back.
                smiles = embedding_funct(embeddings)
            else:
                break

        # Ensure all generated molecules are valid.
        for i in range(5):
            # Rows whose SMILES could not be turned into a molecule have a null
            # 'ROMol' entry.
            PandasTools.AddMoleculeColumnToFrame(interp_df,'SMILES')
            invalid_mol_df = interp_df[interp_df['ROMol'].isnull()]

            if not invalid_mol_df.empty:
                invalid_index = invalid_mol_df.index.to_list()
                for idx in invalid_index:
                    embeddings[idx] = self.addjitter(embeddings[idx],
                                                        distance,
                                                        cnt=1)
                # NOTE(review): as above, the regenerated SMILES are not stored
                # in interp_df here - confirm the intended behaviour.
                smiles = embedding_funct(embeddings)
            else:
                break

        # Cleanup
        if 'ROMol' in interp_df.columns:
            interp_df = interp_df.drop('ROMol', axis=1)

        return interp_df

示例#21

0

显示文件

def get_all(self):
    """Refresh the structure image and the predicted properties for ``molecule[0]``.

    Clears the output fields, draws the molecule onto the canvas, writes a
    one-row ``tmp.csv``, runs the solubility and lipophilicity models on it and
    finally fills in the synthetic accessibility score.
    """
    # Keep the PhotoImage in a module-level name so a reference to it remains
    # after this function returns.
    global image_

    for field in (t2, t_sol, t_lip, t_sasc):
        field.set('')

    print('molecule')

    compound = molecule[0]
    print(compound)
    print('isomeric_smile', compound.isomeric_smiles)
    canonical = compound.canonical_smiles  # read as in the original; not used further
    isomeric = compound.isomeric_smiles
    t2.set(isomeric)

    # Render the structure to a temporary PNG and place it on the canvas.
    rd_mol = Chem.MolFromSmiles(isomeric)
    Draw.MolToFile(rd_mol, 'tmp.png')

    picture = Image.open('tmp.png')
    image_ = ImageTk.PhotoImage(picture, master=frame1)
    canvas.create_image(150, 75, image=image_)

    shown_smiles = t2.get()

    # One-row table consumed by the CSV loader below.
    df = pd.DataFrame({
        'name': [t1.get()],
        'smiles': [shown_smiles],
        'solubility': [0.00],
    })
    df.to_csv('tmp.csv')

    featurizer = dc.feat.graph_features.ConvMolFeaturizer()
    csv_loader = dc.data.data_loader.CSVLoader(tasks=['solubility'],
                                               smiles_field="smiles",
                                               id_field="name",
                                               featurizer=featurizer)
    predictset = csv_loader.featurize('tmp.csv')

    # Predictions are used as base-10 exponents before being displayed.
    sol_pred = model_sol.predict(predictset)
    t_sol.set(round(10 ** sol_pred[0][0], 3))

    lip_pred = model_lip.predict(predictset)
    t_lip.set(round(10 ** lip_pred[0][0], 3))

    PandasTools.AddMoleculeColumnToFrame(frame=df, smilesCol='smiles')

    sa_scores = df.ROMol.map(sascorer.calculateScore)
    first_score = sa_scores[0]

    t_sasc.set(round(first_score, 2))

    print(sa_scores[0])

示例#22

0

显示文件

def get_most_common_fragments(fragments, top_x=50):
    """
    Return the most frequent fragments of a subpocket.

    Parameters
    ----------
    fragments : pandas.DataFrame
        Fragment details; the `smiles` column is used to count occurrences.
    top_x : int
        Number of most common fragments requested.

    Returns
    -------
    pandas.DataFrame
        Fragments sorted by descending `fragment_count`, with columns for the
        SMILES, the ROMol, and the count, indexed by `molecule_id`. Fragments
        tied with the last fragment inside the top x are included as well.
    """

    # Occurrences per SMILES, as a Series named after the count column
    counts = fragments.smiles.value_counts().rename("fragment_count")

    # Series -> DataFrame; the former index becomes the smiles column
    table = counts.reset_index().rename(columns={"index": "smiles"})
    PandasTools.AddMoleculeColumnToFrame(table, "smiles")

    # Order by count (descending) and renumber the rows
    table = table.sort_values("fragment_count", ascending=False)
    table = table.reset_index(drop=True)
    table.index.name = "molecule_id"

    # Fewer fragments than requested: return all of them
    if table.shape[0] < top_x:
        return table

    # Count of the last fragment that still makes it into the top x; keeping
    # everything at or above this value also keeps fragments tied with it
    cutoff = table.iloc[top_x - 1].fragment_count
    return table[table.fragment_count >= cutoff]

示例#23

0

显示文件

    def __init__(self, data, output_name):
        """Build an SDF attachment response from compound records.

        :param data: queryset-like object whose ``values()`` yields the
            compound rows; the ``id`` column is dropped.
        :param output_name: base name (without extension) of the download.
        """
        output = StringIO()
        compounds_df = pd.DataFrame(list(data.values())).drop('id', axis=1)
        PandasTools.AddMoleculeColumnToFrame(compounds_df,
                                             'Smiles',
                                             'ROMol',
                                             includeFingerprints=True)
        PandasTools.WriteSDF(compounds_df,
                             output,
                             molColName='ROMol',
                             idName='PID',
                             properties=list(compounds_df.columns))

        mimetype = 'text/plain'
        file_ext = 'sdf'
        super(SDFResponse, self).__init__(content=output.getvalue(),
                                          content_type=mimetype)

        # Escape backslashes and double quotes so the name stays inside the
        # quoted filename parameter. The previous replace('"', '\"') was a
        # no-op because '\"' is the same one-character string as '"'.
        safe_name = output_name.replace('\\', '\\\\').replace('"', '\\"')
        self['Content-Disposition'] = 'attachment;filename="%s.%s"' % \
                                      (safe_name, file_ext)

示例#24

0

显示文件

def molgrid_image(smiles, file_name, labels=None, molPerRow=5):
    """Draw the molecules as an SVG grid and save it as ``<file_name>.svg``.

    :param smiles: SMILES strings to draw.
    :param file_name: output path without the ``.svg`` extension.
    :param labels: legend per molecule; defaults to the row numbers.
    :param molPerRow: number of molecules per grid row.
    """
    df = pd.DataFrame({'smiles': smiles})
    PandasTools.AddMoleculeColumnToFrame(df, 'smiles', 'mol')
    if labels is None:
        labels = ['{:d}'.format(i) for i in df.index]
    # Honour the molPerRow argument; it used to be ignored in favour of a
    # hard-coded 5 (which is still the default).
    svg = Draw.MolsToGridImage(df['mol'],
                               molsPerRow=molPerRow,
                               legends=labels,
                               useSVG=True)
    save_svg(svg, file_name + '.svg', dpi=150)
    return

示例#25

0

显示文件

def readProjectData(filename, FP, smilesCol):
    """Read the project CSV, build molecules and compute fingerprints.

    Rows whose molecule is None are dropped. Only ``FP == 'Morgan2'`` is
    implemented; for any other value a message is printed and None is
    returned. Otherwise the frame with added ``Molecule`` and ``FP`` columns
    is returned.
    """
    df_proj = pd.read_csv(filename)
    PandasTools.AddMoleculeColumnToFrame(df_proj,
                                         smilesCol=smilesCol,
                                         molCol='Molecule')

    # Keep only the rows that produced a molecule object.
    has_molecule = df_proj['Molecule'].map(lambda mol: mol is not None)
    df_proj = df_proj.loc[has_molecule]

    if FP != 'Morgan2':
        print(FP, ' fingerprint not implemented.')
        return None

    df_proj['FP'] = df_proj.Molecule.map(
        lambda mol: AllChem.GetMorganFingerprint(mol, 2))
    return df_proj

示例#26

0

显示文件

文件： main.py 项目： zinph/MacrolactoneDB

def return_files_sdf():
    """Convert ``temp.pickle`` to ``temp.sdf`` and send it as an attachment.

    If sending fails, the exception text is returned instead.
    """
    frame = pd.read_pickle('temp.pickle')
    all_columns = list(frame.columns)
    PandasTools.WriteSDF(frame,
                         'temp.sdf',
                         molColName='structures',
                         properties=all_columns,
                         allNumeric=False)
    try:
        return send_file('temp.sdf', as_attachment=True)
    except Exception as exc:
        return str(exc)

示例#27

0

显示文件

 def binding_affinity(self,
                      prot_in,
                      lig_in,
                      outpath="results/results_affinity_binding.csv"):
     """Predict binding affinity and binding probability, then export them.

     The features come from ``self.preprocessing(prot_in, lig_in)``; all
     columns from the third onward are fed to the regression model
     (``models/gbdt_regression.joblib``) and the classification model
     (``models/gbdt_model.joblib``). Results are written to *outpath* as CSV
     and to ``results/affinity_out.sdf``.

     :raises FileNotFoundError: if either model file is missing.
     """
     DF = self.preprocessing(prot_in, lig_in)
     X = DF.iloc[:, 2:]
     print(DF.columns)
     logger.info(X.shape)
     jl_filename = "models/gbdt_regression.joblib"
     cl_filename = "models/gbdt_model.joblib"

     # Fail early with a clear error. Previously a missing model was only
     # logged and the function crashed later on the unbound prediction.
     for model_path in (jl_filename, cl_filename):
         if not os.path.isfile(model_path):
             logger.info("no model available")
             raise FileNotFoundError(
                 "model file not found: {0}".format(model_path))

     with open(jl_filename, 'rb') as file:
         regressor = joblib.load(file)
     # Reuse the index of X so the concat below lines predictions up with
     # their source rows even if DF does not have a default integer index.
     ya = pd.Series(regressor.predict(X),
                    index=X.index,
                    name="predicted_affinity")

     with open(cl_filename, 'rb') as file:
         classifier = joblib.load(file)
     # Second column of predict_proba is used as the binding probability.
     yb = pd.Series(classifier.predict_proba(X)[:, 1], index=X.index)

     smiles = DF["smiles"]
     prot = DF["UniProtID"]
     final = pd.concat([smiles, prot, ya, yb], axis=1)
     final.columns = ["smiles", "Uniprot ID", "affinity", "probability"]
     # > 0.7 -> "high", < 0.4 -> "low", everything else -> "medium".
     final["predicted_label"] = np.where(
         final.probability > 0.7, "high",
         np.where(final.probability < 0.4, "low", "medium"))
     logger.info(final.columns)
     logger.info(final[0:10])
     final.to_csv(outpath)
     pp_out = "results/affinity_out.sdf"
     PandasTools.AddMoleculeColumnToFrame(final, 'smiles', 'Molecule')
     PandasTools.WriteSDF(final,
                          pp_out,
                          molColName='Molecule',
                          properties=list(final.columns))

示例#28

0

显示文件

文件： sdf_to_dataset.py 项目： Matrix-Groups/pharml

def split_sdf(sdf_file_name, outdir="data/"):
    """Parse an SDF into ligand and protein lookup tables and write .lig files.

    Returns ``(uligs, prots_ligs)``: ``uligs`` maps a ligand number to its row
    (PDB IDs, SMILES, IC50, molecule) and ``prots_ligs`` maps each PDB ID to
    the list of ligand numbers associated with it. One ``.lig`` file per
    ligand is written below ``outdir``.
    """
    print("Loading sdf.")
    # Raise the RDKit logger threshold to CRITICAL before parsing.
    RDLogger.logger().setLevel(RDLogger.CRITICAL)
    raw = PandasTools.LoadSDF(sdf_file_name,
                              smilesName='SMILES',
                              molColName='Molecule',
                              includeFingerprints=False)
    print("Raw cols = ", [str(col) for col in raw.columns])

    # Keep only the needed columns and merge the two PDB columns into one.
    complex_col = 'PDB ID(s) for Ligand-Target Complex'
    chain_col = 'PDB ID(s) of Target Chain'
    payload_cols = ['SMILES', 'IC50 (nM)', 'Molecule']
    selected = raw[[complex_col, chain_col] + payload_cols].copy()
    selected["PDB IDs"] = selected[complex_col] + ',' + selected[chain_col]
    print("Selected cols = ", [str(col) for col in selected.columns])
    selected = selected[["PDB IDs"] + payload_cols]

    # Empty strings and a lone comma (both PDB fields empty) count as missing.
    selected = selected.replace('', np.nan).replace(',', np.nan).dropna()

    r_rows = len(raw.index)
    s_rows = len(selected.index)
    print("Raw rows = ", r_rows)
    print("Sel rows = ", s_rows)
    keep_pct = (float(s_rows) / float(r_rows)) * 100.0
    print("Keep pct = %.2f%s" % (keep_pct, '%'))

    print("Building protein-ligand dictionary.")
    uligs = {}
    prots_ligs = {}
    for lndx, row in enumerate(selected.values):
        for pdb in row[0].split(','):
            if pdb == '':
                continue
            prots_ligs.setdefault(pdb, []).append(lndx)
        uligs[lndx] = row
    print("Unique proteins = ", len(prots_ligs))

    print("Writing per-ligand output files.")
    for ndx, lig in uligs.items():
        # Position 3 of each row holds the molecule column.
        write_lig_file(lig[3], outdir + "/lig/lig%s.lig" % str(ndx))
    return uligs, prots_ligs

示例#29

0

显示文件

    def compile_filters(self):
        """Apply every filter in ``self.command`` and keep rows passing all of them.

        Each ``limit_*`` helper returns the set of InChI keys passing one
        filter; the intersection is stored in ``self.filtered_inchi`` and the
        matching rows in ``self.filtered_df`` (with an added
        'Molecule picture' column).

        :return: the frame produced by ``self.frame_manage()``.
        """
        command = self.command
        df = self.df

        RS_inchi = self.limit_RS(df, command['RS_min'], command['RS_max'])
        MW_inchi = self.limit_MW(df, command['MW_min'], command['MW_max'])
        nRing_inchi = self.limit_nRing(df, command['nRing_min'],
                                       command['nRing_max'])
        Lipinski_inchi = self.limit_Lipinski(df, command['Lipinski'])
        nG12Ring_inchi = self.limit_nG12Ring(df, command['nG12Ring_min'],
                                             command['nG12Ring_max'])
        SlogP_inchi = self.limit_SlogP(df, command['SlogP_min'],
                                       command['SlogP_max'])
        # Upper bound fixed: 'nSugars_min' used to be passed for both bounds.
        # NOTE(review): assumes 'nSugars_max' exists in self.command, matching
        # the *_min/*_max pairs of the other filters - confirm.
        Sugars_inchi = self.limit_nSugars(df, command['nSugars_min'],
                                          command['nSugars_max'])
        nFRing_inchi = self.limit_nFusedRing(df, command['nFRing_min'],
                                             command['nFRing_max'])
        core_ester_inchi = self.limit_core_ester(df,
                                                 command['core_ester_min'],
                                                 command['core_ester_max'])
        naRing_inchi = self.limit_naRing(df, command['naRing_min'],
                                         command['naRing_max'])
        activity_reported_inchi = self.limit_activity_reported(
            df, command['activity_reported'])

        sets = [
            RS_inchi, MW_inchi, nRing_inchi, Lipinski_inchi, nG12Ring_inchi,
            SlogP_inchi, Sugars_inchi, nFRing_inchi, core_ester_inchi,
            naRing_inchi, activity_reported_inchi
        ]
        self.filtered_inchi = list(set.intersection(*sets))
        # Copy the selection so the molecule column is added to an independent
        # frame rather than to a slice of self.df.
        self.filtered_df = df.loc[df['InChI Keys'].isin(
            self.filtered_inchi)].copy()
        PandasTools.AddMoleculeColumnToFrame(self.filtered_df, 'smiles',
                                             'Molecule picture')

        smiles_frame = self.frame_manage()

        return smiles_frame

示例#30

0

显示文件

def mol_diversity(smiles, normalize=False):
    """Return the mean pairwise Tanimoto distance of the given molecules.

    :param smiles: SMILES strings of the molecules.
    :param normalize: when true, rescale the mean distance so that the
        reference "random" distance maps to 0 and the reference "diverse"
        distance maps to 1. Defaults to False, which returns the raw mean
        distance exactly as before (the rescaling code used to sit,
        unreachable, after the first ``return``).
    :return: the mean distance, or its normalized value.
    """
    df = pd.DataFrame({'smiles': smiles})
    PandasTools.AddMoleculeColumnToFrame(df, 'smiles', 'mol')
    fps = [
        Chem.GetMorganFingerprintAsBitVect(m, 4, nBits=2048) for m in df['mol']
    ]
    dist_1d = tanimoto_1d(fps)
    mean_dist = np.mean(dist_1d)
    if not normalize:
        return mean_dist

    mean_rand = 0.91549  # mean random distance
    mean_diverse = 0.94170  # mean diverse distance
    return (mean_dist - mean_rand) / (mean_diverse - mean_rand)