def main():
    """Desalt, de-fragment and normalize molecules, then write SDF or SMILES.

    Reads the input through ``RDkitRead``, restricts it to the row slice
    ``[start:end]``, drops rows with missing values, and then applies, in
    order, the salt remover, the largest-fragment chooser and the normalizer
    to every molecule. The result is written to ``<outpref>.<format>`` for
    the formats ``sdf`` and ``smi``; any other format writes nothing.
    """
    args = UserInput()
    if args.id is None:
        args.id = 'ID'

    first = 0 if args.start is None else int(args.start)
    # A missing end means "through the last row". The previous default of -1
    # produced the slice [start:-1], which silently dropped the final molecule.
    last = None if args.end is None else int(args.end)

    df = RDkitRead(args.infile, args.id, removeHs=True, add_Hs=False)[first:last].dropna()

    remover = SaltRemover.SaltRemover()
    normzer = rdMolStandardize.Normalizer()
    chooser = rdMolStandardize.LargestFragmentChooser(preferOrganic=True)

    ## remove salts
    print('\033[34m## Desalting moleucles...\033[0m\n')
    df['mol'] = df.MOL.apply(remover.StripMol)
    ## choose largest fragment (most Hs)
    print('\033[34m## Choosing moleucles...\033[0m\n')
    df['mol2'] = df.mol.apply(chooser.choose)
    ## clean molecule (not really relevant?)
    print('\033[34m## Cleaning moleucles...\033[0m\n')
    df['mol3'] = df.mol2.apply(normzer.normalize)
    ## rewrite SMILES with newest mol3
    print('\033[34m## Converting moleucles...\033[0m\n')
    df['smiles'] = df.mol3.apply(Chem.MolToSmiles)

    out_name = args.outpref + '.' + args.format
    if args.format == 'sdf':
        rdpd.WriteSDF(df, out_name, molColName='mol3', idName=args.id, properties=['smiles'])
    elif args.format == 'smi':
        df.to_csv(out_name, index=False, sep=' ', columns=['smiles', args.id], header=True)
Exemplo n.º 2
0
def df_to_sdf(compounds_df, file):
    """Write ``compounds_df`` in SD format to the path ``file``.

    Structures come from the ``ROMol`` column, ``PID`` is passed as the
    identifier column, and every column name is passed as a property.
    """
    with open(file, 'w') as sdf_handle:
        property_names = list(compounds_df.columns)
        PandasTools.WriteSDF(compounds_df, sdf_handle, molColName='ROMol',
                             idName='PID', properties=property_names)
Exemplo n.º 3
0
def main(args: Namespace) -> None:
    """
    Run the dataset preparation pipeline and write the result as SDF.

    The number of rows is printed after loading and after each stage.

    Parameters
    ----------
    args : Namespace
        Namespace object containing the parsed commandline arguments;
        ``infile``, ``keep_props`` and ``outfile`` are read here
    """
    df = read_dataset(args.infile)
    print(f'Initial: {len(df)}')

    # (label printed after the stage, callable mapping a frame to a frame)
    stages = (
        ('After cleaning', lambda frame: cleaning(frame, args.keep_props)),
        ('After filtering', filtering),
        ('After temperature control', filter_by_temperature),
        ('After QuacPac tautomers', run_oe_tautomers),
        ('After unifying dataset', make_dataset_unique),
        ('After Marvin pKa', run_marvin_pka),
    )
    for label, stage in stages:
        df = stage(df)
        print(f'{label}: {len(df)}')

    PandasTools.WriteSDF(df, args.outfile, idName='RowID', properties=df.columns)
Exemplo n.º 4
0
 def test_all_numeric_with_no_numeric_columns(self):
     """Write with ``allNumeric=True`` and check that no tag or numeric value appears."""
     buffer = StringIO()
     PandasTools.WriteSDF(self.df, buffer, allNumeric=True)
     written = buffer.getvalue()
     self.assertFalse(">" in written, written)
     # double-check that the numeric tests don't pass by accident
     for numeric_tail in ("7\n\n", "8\n\n"):
         self.assertNotIn(numeric_tail, written)
Exemplo n.º 5
0
    def search_pattern(self, inp):
        """Write the rows of ``self.df`` selected by one SMARTS pattern.

        The selection is ``self.df.mol >= pattern`` (presumably RDKit's
        substructure comparison on molecule columns -- confirm against how
        ``self.df`` is built). Matches are tagged with the SMARTS string in a
        ``SMARTS_Match`` column and written to ``<prefix>.<self.ext>``: as
        SDF when ``self.ext`` is ``'sdf'``, otherwise as SMILES.

        :param inp: ``(smarts, prefix)`` pair
        """
        smarts, prefix = inp
        pattern = Chem.MolFromSmarts(smarts)

        # Work on an explicit copy: adding a column to a boolean-mask
        # selection triggers pandas' SettingWithCopyWarning and is not
        # guaranteed to behave the same across pandas versions.
        m_df = self.df[self.df.mol >= pattern].copy()
        m_df['SMARTS_Match'] = smarts

        print('\n  > Molecule matching "{0}": {1}\n'.format(smarts, len(m_df)))

        mol_out = prefix + '.' + self.ext
        if self.ext == 'sdf':
            rdpd.WriteSDF(m_df,
                          mol_out,
                          molColName='mol',
                          idName='ID',
                          properties=list(m_df.columns))
        else:
            rdpd.SaveSMILESFromFrame(m_df,
                                     mol_out,
                                     molCol='mol',
                                     NamesCol='ID',
                                     isomericSmiles=True)
Exemplo n.º 6
0
 def save(self, filename_sdf: str, filename_csv: str) -> None:
     """
     Save ``self.source`` as both an sdf and a csv file.
     Both are written since the sdf file takes longer to load than a simple csv file.
     """
     frame = self.source
     property_names = list(frame.columns)
     PandasTools.WriteSDF(frame, filename_sdf, molColName='Molecule',
                          properties=property_names)
     frame.to_csv(filename_csv)
Exemplo n.º 7
0
 def test_specify_numeric_column(self):
     """A numeric column named in ``properties`` is written as a tag with its values."""
     frame = self.df
     frame["len2"] = frame["ID"].map(len)
     out = StringIO()
     PandasTools.WriteSDF(frame, out, properties=["len2"])
     text = out.getvalue()
     self.assertEqual(text.count("<len2>"), 2)
     for expected in ("7\n\n", "8\n\n"):
         self.assertIn(expected, text)
Exemplo n.º 8
0
 def test_all_numeric_with_numeric_columns(self):
     """With ``allNumeric=True`` an added numeric column is written as a tag."""
     frame = self.df
     frame["len"] = frame["ID"].map(len)
     out = StringIO()
     PandasTools.WriteSDF(frame, out, allNumeric=True)
     text = out.getvalue()
     self.assertEqual(text.count("<len>"), 2)
     for expected in ("7\n\n", "8\n\n"):
         self.assertIn(expected, text)
def _write_subset(frame, outfile):
    """Write ``frame`` as SMILES when ``outfile`` contains ``.smi``, else as SDF."""
    # The dot is escaped so only a literal ".smi" selects SMILES output; the
    # unescaped pattern also matched names such as "mysmiles.sdf".
    if re.search(r'\.smi', outfile, re.IGNORECASE):
        # Copy before adding the column: ``frame`` is a filtered selection,
        # and attribute assignment (``frame.smiles = ...``) does not create a
        # column at all when none exists yet.
        frame = frame.copy()
        frame['smiles'] = frame.MOL.apply(
            lambda m: Chem.MolToSmiles(Chem.RemoveHs(m)))
        frame.to_csv(outfile,
                     columns=['smiles', 'ID'],
                     sep=' ',
                     header=False,
                     index=False)
    else:
        rdpd.WriteSDF(frame,
                      outfile,
                      molColName='MOL',
                      properties=list(frame.columns))


def main():
    """Split the input molecules by their ``NeverSee_Groups`` flag and write both sets.

    Rows flagged ``'Y'`` go to ``args.nsee_file`` and rows flagged ``'N'`` go
    to ``args.pass_file``; the size of each set is printed first.
    """
    args = UserInput()

    df = RDkitRead(args.infile, args.id, removeHs=False, add_Hs=False)

    nsee_df = df[df['NeverSee_Groups'] == 'Y']
    pass_df = df[df['NeverSee_Groups'] == 'N']

    print('\033[34m Passed NeverSee Filter: \033[32m{0}\033[0m'.format(
        len(pass_df)))
    print('\033[34m Failed NeverSee Filter: \033[31m{0}\033[0m'.format(
        len(nsee_df)))

    _write_subset(nsee_df, args.nsee_file)
    _write_subset(pass_df, args.pass_file)
    print('')
Exemplo n.º 10
0
 def test_write_to_sdf_gz(self):
     """Writing to a ``.gz`` filename yields a gzip file containing two records."""
     dirname = tempfile.mkdtemp()
     try:
         filename = os.path.join(dirname, "test.sdf.gz")
         PandasTools.WriteSDF(self.df, filename)
         # Close the handle deterministically, and decode before searching:
         # gzip.open() defaults to binary mode, so on Python 3 the payload is
         # bytes and cannot be compared against str patterns.
         with gzip.open(filename) as f:
             s = f.read()
         s = s.decode('utf-8')
         s = s.replace(os.linesep, '\n')
         self.assertEqual(s.count("\n$$$$\n"), 2)
         self.assertEqual(s.split("\n", 1)[0], "Methane")
     finally:
         shutil.rmtree(dirname)
Exemplo n.º 11
0
def update_sdf():
    """Dump every ``Compound`` row to ``media/all_data.sdf``.

    Molecules are built from the ``Smiles`` column into ``ROMol``; the
    bookkeeping columns ``id``, ``created_at`` and ``updated_at`` are removed
    before writing.
    """
    compounds_df = pd.DataFrame(list(Compound.objects.all().values()))
    # ``DataFrame.isnull`` is a bound method and therefore always truthy, so
    # the former ``not compounds_df.isnull`` guard never ran the drop. An
    # empty frame has no columns to drop, hence the ``empty`` check.
    if not compounds_df.empty:
        compounds_df = compounds_df.drop(['id', 'created_at', 'updated_at'], axis=1)
    PandasTools.AddMoleculeColumnToFrame(compounds_df, 'Smiles', 'ROMol', includeFingerprints=True)
    os.makedirs('media', exist_ok=True)
    with open('media/all_data.sdf', 'w') as fi:
        PandasTools.WriteSDF(compounds_df, fi, molColName='ROMol', idName='PID',
                             properties=list(compounds_df.columns))
Exemplo n.º 12
0
def return_files_sdf():
    """Convert the pickled frame in ``temp.pickle`` to ``temp.sdf`` and send it.

    Returns the ``send_file`` result, or the exception text if sending fails.
    """
    frame = pd.read_pickle('temp.pickle')
    sdf_path = 'temp.sdf'
    PandasTools.WriteSDF(frame, sdf_path, molColName='structures',
                         properties=list(frame.columns), allNumeric=False)
    try:
        return send_file(sdf_path, as_attachment=True)
    except Exception as err:
        return str(err)
Exemplo n.º 13
0
 def test_write_to_sdf_gz(self):
     """Write to a ``.gz`` path, read it back through gzip and check the records."""
     workdir = tempfile.mkdtemp()
     try:
         target = os.path.join(workdir, "test.sdf.gz")
         PandasTools.WriteSDF(self.df, target)
         with gzip.open(target) as handle:
             raw = handle.read()
         # normalise platform line endings before matching on "\n"
         text = raw.decode('utf-8').replace(os.linesep, '\n')
         self.assertEqual(text.count("\n$$$$\n"), 2)
         self.assertEqual(text.split("\n", 1)[0], "Methane")
     finally:
         shutil.rmtree(workdir)
Exemplo n.º 14
0
def main():
    """Extract the molecules whose ID is in a keyword list and write one SDF.

    IDs are read from the first whitespace-separated column of
    ``args.mol_id`` (``#`` starts a comment). Each file in ``args.infiles``
    is loaded, the ``CheckID`` results for its ``ID`` column are split into
    ``Name``/``Rank``/``Score``/``Soft`` columns, and rows whose ``id_tag``
    value is a keyword are kept. IDs that were not found are reported, and
    the selection is written to ``<outpref>.sdf.gz``.
    """
    args = UserInput()
    id_tag = args.id_tag if args.id_tag else 'Name'
    sort_tag = args.sort_tag if args.sort_tag else False

    ###############
    ## Read in the list of selected ligand ID
    # raw string: '\s' in a normal literal is an invalid escape sequence
    n_df = pd.read_csv(args.mol_id, delimiter=r'\s+', header=None,
                       comment='#').dropna()
    keywords = n_df.loc[:, 0].to_list()
    print('\n > Number of items in <{}>: {}\n'.format(args.mol_id,
                                                      len(keywords)))

    ## Extract the selected ligands from the supplied SDFs
    mol_sele = []
    for infile in args.infiles:
        df = RDkitRead(infile, removeHs=False)
        # Transpose the per-molecule CheckID tuples once instead of four times.
        fields = list(zip(*df['ID'].apply(CheckID)))
        df['Name'] = fields[0]
        df['Rank'] = fields[1]
        df['Score'] = fields[2]
        df['Soft'] = fields[3]
        mol_sele.append(df[df[id_tag].isin(keywords)])
        del df
        gc.collect()

    all_df = pd.concat(mol_sele).reset_index(drop=True)
    # Build the lookup set once; it used to be rebuilt for every keyword.
    found_id = set(all_df[id_tag].to_list())
    missed_id = [x for x in keywords if x not in found_id]

    # ``missed_id`` is a list, so the former ``is False`` test could never be
    # true and missing IDs were never reported.
    if missed_id:
        print(
            '\033[31m  Info: \033[35m{0}\033[31m MOL cannot be found:\033[0m'.
            format(len(missed_id)))
        print(missed_id)

    ## Sort data, if needed
    if sort_tag:
        all_df.sort_values(by=[sort_tag], ascending=True, inplace=True)

    rdpd.WriteSDF(all_df,
                  args.outpref + '.sdf.gz',
                  molColName='mol',
                  properties=list(all_df.columns))
Exemplo n.º 15
0
def get_df_download_sdf(df, download_filename, link_label, structure_column):
    """Return an HTML ``<a>`` tag whose data URI holds ``df`` in SD format.

    The frame is written to an in-memory buffer with ``structure_column`` as
    the molecule column and every column as a property; the text is then
    base64-encoded into the link's ``href``.
    """
    sdf_buffer = StringIO()
    PandasTools.WriteSDF(df, sdf_buffer, molColName=structure_column,
                         properties=list(df.columns), allNumeric=False)
    # base64 operates on bytes: encode the text, then turn the result back into str
    raw_bytes = sdf_buffer.getvalue().encode()
    b64 = base64.b64encode(raw_bytes).decode()
    return f'<a href="data:file/txt;base64,{b64}" download="{download_filename}">{link_label}</a>'
Exemplo n.º 16
0
def df2sdf(df,
           output_sdf_name,
           smiles_field='canonical_smiles',
           id_field='chembl_id',
           selected_batch=None):
    '''
    Pack a pd.DataFrame into an sdf file.

    A ``ROMol`` column is built from ``smiles_field`` and the frame is
    written to ``output_sdf_name`` with ``id_field`` as identifier and all
    columns as properties. When ``selected_batch`` is given, only rows whose
    ``label`` equals it are written.
    '''
    if selected_batch is not None:
        # Copy the selection: the molecule column is added in place below,
        # and doing that on a ``.loc`` slice raises SettingWithCopyWarning.
        df = df.loc[df['label'] == selected_batch].copy()
    PandasTools.AddMoleculeColumnToFrame(df, smiles_field, 'ROMol')
    PandasTools.WriteSDF(df,
                         output_sdf_name,
                         idName=id_field,
                         properties=df.columns)

    return
Exemplo n.º 17
0
def main():
    """Rename docked molecules with their rank and score, then write SDF and a score table.

    Loads ``args.infile``, keeps the first ``args.top`` records (all of them
    when ``-top`` is not given), sets each molecule's ``_Name`` to
    ``<name>::<rank>::<score>::<dock>`` and writes
    ``<outpref>.<dock>_docked.sdf.gz`` plus a tab-separated name/score file
    ``<outpref>.<dock>_docked.txt.bz2``.
    """
    args = UserInput()
    name = args.name or 'ID'
    score = args.score or 'Chemgauss4'
    dock = args.dock or 'fred'
    # ``None`` as a slice bound keeps every record. The former sentinel of -1
    # made ``[:top]`` drop the last molecule although it was meant as "all".
    top = int(args.top) if args.top else None

    df = rdpd.LoadSDF(args.infile,
                      removeHs=False,
                      molColName='ROMol',
                      idName='mol_ID')[:top].fillna('')
    print('\033[34m> select mol: \033[32m{0}\033[0m'.format(len(df)))
    df[score] = df[score].apply(float)
    df['Rank'] = df.index

    for idx, row in df.iterrows():
        # SetProp mutates the molecule object held in the frame
        df.at[idx, 'ROMol'].SetProp(
            '_Name',
            '{0}::{1}::{2:.2f}::{3}'.format(row[name], row['Rank'] + 1,
                                            row[score], dock))

    sdf_out = '{0}.{1}_docked.sdf.gz'.format(args.outpref, dock)
    csv_out = '{0}.{1}_docked.txt.bz2'.format(args.outpref, dock)

    rdpd.WriteSDF(df, sdf_out, properties=list(df.columns))
    df.to_csv(csv_out,
              header=False,
              index=False,
              sep='\t',
              columns=[name, score],
              float_format='%.3f')
Exemplo n.º 18
0
    def write_sdf(self, data: pd.DataFrame, outfile_name: str,
                  smiles_column: str):
        """
            Runs the data through ``prepare_data_for_sdf`` and writes the
            result to ``<outfile_name>.sdf`` using PandasTools.

            :param data: Dataframe to be written
            :param smiles_column: SMILES column in the dataframe to be processed
            :param outfile_name: output file name, without the sdf extension
        """

        prepared = self.prepare_data_for_sdf(data, smiles_column, copy=True)
        sdf_path = outfile_name + '.sdf'

        PandasTools.WriteSDF(prepared, sdf_path, molColName='ROMol',
                             properties=list(prepared.columns),
                             idName=self.identifier)
Exemplo n.º 19
0
def main():
    """Build hydrogen-added, embedded molecules from a SMILES table and write them as SDF.

    Reads ``rdkit_smiles`` and ``compound_id`` from the CSV ``args.infile``,
    adds hydrogens to each molecule, embeds it, records its formal charge in
    ``i_user_TOTAL_CHARGE`` and writes everything to ``args.outfile``.
    """
    args = getArgs()
    print(args.infile, args.outfile)
    smiles_df = pd.read_csv(args.infile)
    # Independent copy: columns are added below, which on a column selection
    # of ``smiles_df`` would raise SettingWithCopyWarning.
    pp = smiles_df[['rdkit_smiles', 'compound_id']].copy()

    PandasTools.AddMoleculeColumnToFrame(pp, 'rdkit_smiles', 'Molecule')

    for index, row in pp.iterrows():
        mol = Chem.AddHs(row['Molecule'])
        AllChem.EmbedMolecule(mol)
        # iterrows() yields a copy of each row, so assigning to ``row`` never
        # reached the frame and the prepared molecule was discarded. Store it
        # back explicitly so the SDF contains the hydrogen-added structure.
        pp.at[index, 'Molecule'] = mol
        pp.at[index, 'i_user_TOTAL_CHARGE'] = Chem.rdmolops.GetFormalCharge(
            mol)

    PandasTools.WriteSDF(pp,
                         args.outfile,
                         molColName='Molecule',
                         idName='compound_id',
                         properties=list(pp.columns))
Exemplo n.º 20
0
    def __init__(self, data, output_name):
        """Build an SDF attachment response from ``data``.

        :param data: object whose ``values()`` yields one mapping per compound
        :param output_name: download file name, without the ``.sdf`` extension
        """
        output = StringIO()
        compounds_df = pd.DataFrame(list(data.values())).drop('id', axis=1)
        PandasTools.AddMoleculeColumnToFrame(compounds_df,
                                             'Smiles',
                                             'ROMol',
                                             includeFingerprints=True)
        PandasTools.WriteSDF(compounds_df,
                             output,
                             molColName='ROMol',
                             idName='PID',
                             properties=list(compounds_df.columns))

        mimetype = 'text/plain'
        file_ext = 'sdf'
        super(SDFResponse, self).__init__(content=output.getvalue(),
                                          content_type=mimetype)
        # '\"' is the same string as '"', so the former replace() changed
        # nothing. A real backslash is needed to escape quotes inside the
        # quoted filename parameter.
        safe_name = output_name.replace('"', '\\"')
        self['Content-Disposition'] = 'attachment;filename="%s.%s"' % \
                                      (safe_name, file_ext)
Exemplo n.º 21
0
def main():
    """Cluster the input molecules with affinity propagation and write them as SDF.

    The input (``args.i``) is parsed as text or SD file depending on its
    name, fingerprints are clustered, the silhouette score is printed and
    the cluster label of each molecule is stored in a ``Cluster`` column
    before writing ``args.o``.
    """
    args = parse_args()

    def name_has(*markers):
        # ``(".txt" or ".csv") in name`` evaluates to ``".txt" in name`` only,
        # so every marker is tested individually here.
        return any(marker in args.i for marker in markers)

    if name_has(".txt", ".csv"):
        df = parse_text_file(args.i)
    elif name_has("sd.gz", "sdf.gz"):
        df = parse_sd_file(args.i, tgz=True)
    elif name_has(".sd", ".sdf"):
        df = parse_sd_file(args.i)
    else:
        # Previously ``df`` stayed unbound and the next line failed with a
        # NameError that did not mention the input file.
        raise ValueError(
            'unsupported input file type: {0!r} '
            '(expected .txt, .csv, .sd, .sdf, .sd.gz or .sdf.gz)'.format(args.i))
    FP = fp_from_df(df)
    labels = AffinityPropagation(damping=float(args.damping),
                                 max_iter=int(args.max_iter),
                                 convergence_iter=int(
                                     args.convergence)).fit(FP).labels_
    print(metrics.silhouette_score(FP, labels, metric='euclidean'))
    df['Cluster'] = labels
    PandasTools.WriteSDF(df,
                         args.o,
                         molColName='Molecule',
                         idName="CID",
                         properties=list(df.columns))
Exemplo n.º 22
0
 def binding_affinity(self,
                      prot_in,
                      lig_in,
                      outpath="results/results_affinity_binding.csv"):
     """Predict affinity and binding probability and write them as CSV and SDF.

     Features are every column of the preprocessed frame from the third one
     onwards. The regression model gives the ``affinity`` column and the
     classifier's second ``predict_proba`` column gives ``probability``,
     which is binned into ``predicted_label``: above 0.7 is "high", below 0.4
     is "low", anything else "medium". The table is written to ``outpath``
     and, with molecules built from ``smiles``, to
     ``results/affinity_out.sdf``.

     :raises FileNotFoundError: if either model file is missing
     """
     DF = self.preprocessing(prot_in, lig_in)
     X = DF.iloc[:, 2:]
     print(DF.columns)
     logger.info(X.shape)
     jl_filename = "models/gbdt_regression.joblib"
     cl_filename = "models/gbdt_model.joblib"
     # Both predictions are required below. Previously a missing model was
     # only logged and the method then failed on an unbound variable.
     for model_path in (jl_filename, cl_filename):
         if not os.path.isfile(model_path):
             logger.info("no model available")
             raise FileNotFoundError("model file not found: " + model_path)
     # Predictions are row-aligned with X, so they carry X's index; with the
     # default RangeIndex, concat would misalign them for any other index.
     with open(jl_filename, 'rb') as file:
         ya = pd.Series(joblib.load(file).predict(X),
                        index=X.index,
                        name="predicted_affinity")
     with open(cl_filename, 'rb') as file:
         yb = pd.Series(joblib.load(file).predict_proba(X)[:, 1],
                        index=X.index)
     smiles = DF["smiles"]
     prot = DF["UniProtID"]
     final = pd.concat([smiles, prot, ya, yb], axis=1)
     final.columns = ["smiles", "Uniprot ID", "affinity", "probability"]
     final["predicted_label"] = np.where(
         final.probability > 0.7, "high",
         np.where(final.probability < 0.4, "low", "medium"))
     logger.info(final.columns)
     logger.info(final[0:10])
     final.to_csv(outpath)
     pp_out = "results/affinity_out.sdf"
     PandasTools.AddMoleculeColumnToFrame(final, 'smiles', 'Molecule')
     PandasTools.WriteSDF(final,
                          pp_out,
                          molColName='Molecule',
                          properties=list(final.columns))
def mols_to_sdbuffer(df: pd.DataFrame, props: List[str] = None) -> StringIO:
    """
    Serializes a DataFrame in SD format into an in-memory text buffer.

    Parameters
    ----------
    df : DataFrame
        DataFrame to serialize; the molecule column is left at the
        ``WriteSDF`` default
    props : List[str]
        Column names passed to ``WriteSDF`` as the properties to write

    Returns
    -------
    StringIO
        The buffer that ``WriteSDF`` wrote to, returned without rewinding
    """

    sd_buffer = StringIO()
    PandasTools.WriteSDF(df, sd_buffer, properties=props)
    return sd_buffer
def main(args: Namespace) -> None:
    """
    Load an SDF, run the preparation stages and write the result as SDF.

    The number of rows is printed after loading and after each stage.

    Parameters
    ----------
    args : Namespace
        Namespace object containing the parsed commandline arguments;
        ``infile``, ``keep_props`` and ``outfile`` are read here
    """

    def report(label, frame):
        # print the row count for this stage and hand the frame back
        print(f'{label}: {len(frame)}')
        return frame

    loaded = PandasTools.LoadSDF(args.infile)
    df = report('Initial', loaded.set_index('ID', verify_integrity=True))
    df = report('After cleaning', cleaning(df, args.keep_props))
    df = report('After filtering', filtering(df))
    df = report('After QuacPac tautomers', run_oe_tautomers(df))
    df = report('After Marvin pKa', run_marvin_pka(df))
    df = report('After removing strong outlier',
                filter_strong_outlier_by_marvin(df))

    marvin_columns = ['marvin_pKa', 'marvin_atom', 'marvin_pKa_type']
    df.columns = ['ROMol'] + args.keep_props + marvin_columns

    PandasTools.WriteSDF(df, args.outfile, idName='RowID', properties=df.columns)
Exemplo n.º 25
0
 def test_identifier_from_a_column(self):
     """With ``idName="prop2"`` the first output line is that column's value."""
     out = StringIO()
     PandasTools.WriteSDF(self.df, out, idName="prop2")
     title_line = out.getvalue().split("\n", 1)[0]
     self.assertEqual(title_line, "qwe")
Exemplo n.º 26
0
 def test_default_write_does_not_include_tags(self):
     """Without ``properties`` the written SDF must not mention ``prop2``."""
     sio = StringIO()
     PandasTools.WriteSDF(self.df, sio)
     s = sio.getvalue()
     # assertNotIn(member, container): the arguments were reversed, which
     # asked whether the whole SDF text occurs inside "prop2" -- always true.
     self.assertNotIn("prop2", s)
def main():
    """Run GOLD docking as parallel subjobs and merge the results.

    Steps: read the configuration file and override it from the command
    line, decompress a gzip/bzip2 ligand file into the working directory,
    generate one configuration per subjob, run the subjobs in a process
    pool, post-process each subjob's docked SDF, then concatenate, optionally
    sort and truncate, and write the final SDF and score table (both
    compressed with bzip2).
    """
    args = UserInput()
    if args.genconf:
        GenerateConfTemplInput()
        sys.exit()
    if args.savetop:
        try:
            savetop = int(args.savetop)
        except (TypeError, ValueError):
            # int() raises ValueError for non-numeric text; catching only
            # TypeError let that case escape as a raw traceback.
            sys.exit('\033[31m  ERROR: -top must be an integer: \033[0m' +
                     str(args.savetop))
        # Keeping the top N requires sorted data. This used to set
        # ``nosort = True``, which skipped both the sort and the truncation.
        args.nosort = False  # force sorting

    ########################
    ## Read input configure file
    settings = ReadConfSettings(args.conffile)
    settings['receptor'] = args.receptor
    settings['cavity'] = args.cavity
    settings['rslt_pref'] = args.rslt_pref

    if args.lig_ref:
        settings['lig_ref'] = args.lig_ref
    if args.constr:
        settings['constr_file'] = args.constr

    ## handle sdf file in gzip/bzip2
    # dots are escaped so only real ".gz"/".bz2" suffixes match
    if re.search(r'\.gz$', args.ligand):
        ligand = args.ligand.split('/')[-1].split('.gz')[0]
        os.system('gunzip -c {0} > ./{1}'.format(args.ligand, ligand))
        settings['ligand'] = '{0}/{1}'.format(cwd, ligand)
    elif re.search(r'\.bz2$', args.ligand):
        ligand = args.ligand.split('/')[-1].split('.bz2')[0]
        os.system('bunzip2 -c {0} > ./{1}'.format(args.ligand, ligand))
        settings['ligand'] = '{0}/{1}'.format(cwd, ligand)
    else:
        settings['ligand'] = args.ligand

    ## Write a list of gold.conf files
    Confs = GenerateConfFiles(settings)
    print('\033[34m## Generated subjobs: \033[33m{0}\033[0m'.format(
        len(Confs)))

    ## Run GOLD in parallel until all finished
    requested = int(args.cpu)
    available = multiprocessing.cpu_count()
    core = requested if 0 < requested <= available else available
    mpi = multiprocessing.Pool(core)
    # list() drains the lazy imap iterator so every subjob has completed
    list(tqdm(mpi.imap(RunGOLD, Confs), total=len(Confs)))
    mpi.close()
    mpi.join()

    ############ Post-processing #############

    tmpdsf = settings['tmpdsf']
    findsf = settings['findsf']
    finssf = settings['finssf']
    mol_id = settings['mol_id']
    if settings['gold_funct'] == 'plp':
        score = 'Gold.PLP.Fitness'
    else:
        # ``score`` used to stay unbound here and fail later with a NameError
        raise ValueError(
            'no score column defined for gold_funct {0!r}'.format(
                settings['gold_funct']))

    ## Modify each subjob docking result, summarize them all into 1 dataframe
    pref_list = [c.split('.conf')[0] for c in Confs]
    dock_list = []
    for pref in pref_list:
        os.chdir(pref)
        in_sdf = '{0}.{1}'.format(pref, tmpdsf)
        out_sdf = '{0}.{1}'.format(pref, findsf)

        ## modify docked sdf file, collect them
        dock_list.append(RescaleRename(in_sdf, out_sdf, mol_id, score))
        os.system('bzip2 *sdf *lst')
        os.chdir(cwd)

    ## combine all subjob data, sort by ranking, output docked sdf and rank
    ## save only top ligands if needed
    # ``filter(None, ...)`` evaluates each DataFrame's truth value, which
    # pandas refuses with a ValueError; compare against None explicitly.
    xdf = pd.concat([d for d in dock_list if d is not None])
    if not args.nosort:
        xdf.sort_values(by=[score], ascending=True, inplace=True)
        if args.savetop:
            xdf = xdf[:savetop]

    fin_sdf = '{0}.{1}'.format(settings['rslt_pref'], findsf)
    fin_scr = '{0}.{1}'.format(settings['rslt_pref'], finssf)
    rdpd.WriteSDF(xdf, fin_sdf, properties=list(xdf.columns))
    xdf.to_csv(fin_scr,
               index=False,
               sep='\t',
               columns=[mol_id, score],
               header=False,
               float_format='%.3f')
    os.system('bzip2 {0} {1}'.format(fin_sdf, fin_scr))
# Load the pickled model that is used for the prediction below.
print('Loading model...')
with open('RF_CV_FMorgan3_pKa.pkl', 'rb') as f:
    model = pkl.load(f)

# Run the same preparation stages on ``df``, printing the row count after each.
print('Start preparing dataset...')
df = cleaning(df, [column for column in df.columns if column != 'ROMol'])
print(f'After cleaning: {len(df)}')

df = filtering(df)
print(f'After filtering: {len(df)}')

df = run_oe_tautomers(df)
print(f'After QuacPac tautomers: {len(df)}')

# One 4096-bit, radius-3, feature-based Morgan fingerprint per molecule.
print('Calculating fingerprints...')
fmorgan3 = np.array([
    Chem.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=4096, useFeatures=True)
    for mol in df.ROMol
])

print('Predicting...')
df['pKa_prediction'] = model.predict(fmorgan3)

print('Writing result file...')
PandasTools.WriteSDF(df, args.out, properties=df.columns, idName='RowID')
Exemplo n.º 29
0
        new_row = [ID] + list(calc.CalcDescriptors(row['ROMol']))
        values.append(new_row)
        count += 1
    df_result = pd.DataFrame(values, columns=columns)
    return df_result


# In[5]:

# Compute the descriptor table for every row of ``original_df``.
descriptor_df = gen_descriptors(original_df, calc)

# In[22]:

# DataFrame.join with ``on='ID'`` matches descriptor_df's ``ID`` column
# against original_df's index.
merged = descriptor_df.join(original_df, on='ID')
# Bare expression: the ``# In[..]`` markers suggest a notebook export, where
# this displays the two columns; run as a script it has no effect.
merged[['MolWt', 'FW']]

# In[38]:

# Write the merged table as SDF, passing every column as a property.
PandasTools.WriteSDF(merged,
                     out='data/fulldata.sdf',
                     properties=list(merged.columns))

# In[41]:

# Read the file back and print a summary of what was stored.
loaded_df = PandasTools.LoadSDF('data/fulldata.sdf')
loaded_df.info()

# In[46]:

# Bare expression again: selects a few descriptor columns for display.
loaded_df[['Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10']]
Exemplo n.º 30
0
                    help="dataset type",
                    type=str,
                    choices=['regression', 'classification'],
                    required=True)
args = parser.parse_args()

# One entry per dataset: name -> (dataset type, path to the input CSV)
DATASETS = OrderedDict()
DATASETS[args.data_name] = (args.dataset_type, args.data_path)

os.makedirs('sdfs', exist_ok=True)
for name in DATASETS:
    print(name)
    filename = os.path.join('sdfs', name + '_smiles.csv')
    with open(DATASETS[name][1], 'r') as rf, open(filename, 'w') as wf:
        rf.readline()  # the first line is skipped
        # the SMILES string is the first comma-separated field of each line
        all_smiles = [line.strip().split(',')[0] for line in rf]
        wf.writelines(smiles + '\n' for smiles in all_smiles)

    pp = pd.DataFrame(all_smiles, columns=['Smiles'])
    # Called for its effect on ``pp``; the original author noted that
    # assigning the return value back to ``pp`` did not work.
    PandasTools.AddMoleculeColumnToFrame(pp, 'Smiles', 'Molecule')
    sdf_path = os.path.join('sdfs', name + '.sdf')
    PandasTools.WriteSDF(pp, sdf_path, molColName='Molecule',
                         properties=list(pp.columns))