def WriteSDFDataSelect(Top_List, Top_sdf, arg_pat, all_top, top_name, dock):
    """Write the top-ranked molecules that match a SMARTS selection.

    Walks ``Top_List`` in order, looks every entry up in ``Top_sdf`` and keeps
    the molecules that match at least one of the SMARTS patterns in
    ``arg_pat``.  Collection stops once ``all_top`` molecules matched or the
    list is exhausted.  The matched molecules are written to
    ``<top_name>.smt-selec.sdf`` and their name/score pairs to
    ``<top_name>.smt-selec.txt``.

    Side effect: the ``_Name`` property of every molecule found is rewritten
    to ``name::rank::score::dock`` (rank is the 1-based position in
    ``Top_List``).

    Args:
        Top_List: ranked sequence of items; item[0] is the score and item[1]
            the molecule name.
        Top_sdf: mapping of molecule name -> molecule object.
        arg_pat: one or more SMARTS strings separated by '|'.
        all_top: number of matched molecules at which collection stops.
        top_name: prefix of the two output files.
        dock: label appended as the last field of the molecule title.

    Note:
        ``grid`` and ``grid_print`` are not defined in this function; they
        are resolved from the enclosing module.
    """
    ## Compile every SMARTS once instead of once per molecule
    patterns = [Chem.MolFromSmarts(p) for p in arg_pat.split('|')]

    Others, Matched = [], []
    looked = 0

    ## Use the Ranked list to rebuild a consolidated SDF
    ## if molecule matches SMARTS filter, separate it
    with open(top_name + '.smt-selec.txt', 'w') as SMA:
        for idx, Item in enumerate(Top_List):
            looked = idx + 1
            score, name = Item[0], Item[1]

            ## If mol_name has conformer number appended on it, remove _NUM
            name = name.split('_')[0]

            mol = Top_sdf.get(name)
            if mol is None:
                print(' --> Molecule not found: {0} <--'.format(name))
                continue

            ## Rename mol name property to include data
            ## (ZINC, Rank, Score, Software)
            mol.SetProp(
                '_Name',
                '{0}::{1}::{2:.1f}::{3}'.format(name, idx + 1, float(score),
                                                dock))

            if any(mol.HasSubstructMatch(patt) for patt in patterns):
                Matched.append(mol)
                SMA.write('{0}\t{1}\n'.format(name, score))
            else:
                Others.append(mol)

            ## Stop when reached the Max. output number
            if len(Matched) == all_top:
                break

    ## Write the results after the loop so that they are always flushed and
    ## closed, even when the last ranked molecule is missing from Top_sdf or
    ## fewer than all_top molecules matched.
    m = Chem.SDWriter(top_name + '.smt-selec.sdf')
    try:
        for mol in Matched:
            m.write(mol)
    finally:
        m.close()

    print("\n ## Total Molecule Looked Thru: " + str(looked))
    print(' ## Molecule Not Matched: ' + str(len(Others)))
    ## Report the pattern that was actually applied (the parameter) rather
    ## than a name that is not defined inside this function.
    print(' ## Molecule Matched {0}: {1}'.format(arg_pat, len(Matched)))
    gc.collect()

    if grid is True: grid_print(top_name, Matched, 'sdf')
예제 #2
0
def main(list_name, Chemicals, option):
    """Extract the ligands named in a list file from SDF input and save them.

    Reads ligand IDs (first whitespace-separated token of each line returned
    by ``remove_remark(file_handle(list_name))``), collects the matching
    molecules from ``rdkit_open(Chemicals)``, optionally sorts them, writes
    them to ``<list_name without '.txt'>.sdf`` and passes them to
    ``grid_print``.

    Molecule titles of the form ``name::rank::score::x`` are split into their
    fields; any other title is used as the name with rank 0 and score 0.0.

    Args:
        list_name: path of the file holding the selected ligand IDs.
        Chemicals: structure input handed unchanged to ``rdkit_open``.
        option: None for list order, or 'name' / 'rank' / 'score' to sort by
            that field.  Any other value leaves the order unchanged.
    """
    ## Read in the list of selected ligand ID (blank lines are ignored)
    List = [
        line.split()[0] for line in remove_remark(file_handle(list_name))
        if line.split()
    ]
    print(len(List))

    ## Index the supplied SDF by ligand name
    sdf = dict()
    for m in rdkit_open(Chemicals):
        title = m.GetProp('_Name')
        if '::' in title:
            name, rank, score, x = title.split('::')
            sdf[name] = [m, name, rank, score]
        else:
            sdf[title] = [m, title, 0, 0.0]

    ## Extract the selected ligands; report IDs absent from the SDF instead
    ## of aborting with a bare KeyError
    Molecules = []
    for chem in List:
        if chem in sdf:
            Molecules.append(sdf[chem])
        else:
            print(' --> Molecule not found: {0} <--'.format(chem))

    ## Sort data, if needed
    if option is not None:
        if option == 'name': Molecules.sort(key=lambda tup: tup[1])
        elif option == 'rank': Molecules.sort(key=lambda tup: int(tup[2]))
        elif option == 'score': Molecules.sort(key=lambda tup: float(tup[3]))

    Mols = [mol[0] for mol in Molecules]
    out_prefix = list_name.split('.txt')[0]
    out = Chem.SDWriter(out_prefix + '.sdf')
    try:
        for molecule in Mols:
            out.write(molecule)
        out.flush()
    finally:
        ## Always release the writer, even if a molecule fails to write
        out.close()

    grid_print(out_prefix, Mols, 'sdf')
def WriteSDFDataExclude(Top_List, Top_sdf, arg_pat, all_top, top_name, dock):
  """Write the top-ranked molecules that do NOT match a SMARTS filter.

  Walks ``Top_List`` in order, looks every entry up in ``Top_sdf`` and drops
  the molecules that match at least one of the SMARTS patterns in
  ``arg_pat``.  Collection stops once ``all_top`` molecules were kept or the
  list is exhausted.  The kept molecules are written to
  ``<top_name>.smt-clean.sdf`` and their name/score pairs to
  ``<top_name>.smt-clean.txt``.

  Side effect: the ``_Name`` property of every molecule found is rewritten
  to ``name::rank::score::dock`` (rank is the 1-based position in
  ``Top_List``).

  Args:
    Top_List: ranked sequence of items; item[0] is the score and item[1]
      the molecule name.
    Top_sdf: mapping of molecule name -> molecule object.
    arg_pat: one or more SMARTS strings separated by '|'.
    all_top: number of kept molecules at which collection stops.
    top_name: prefix of the two output files.
    dock: label appended as the last field of the molecule title.

  Note:
    ``grid`` and ``grid_print`` are not defined in this function; they are
    resolved from the enclosing module.
  """
  ## Compile every SMARTS once instead of once per molecule
  patterns = [Chem.MolFromSmarts(p) for p in arg_pat.split('|')]

  Select, Exclude = [], []
  looked = 0

  ## Use the Ranked list to rebuild a consolidated SDF
  ## if molecule matches SMARTS filter, separate it
  with open(top_name+'.smt-clean.txt', 'w') as OUT:
    for idx, Item in enumerate(Top_List):
      looked = idx + 1
      score, name = Item[0], Item[1]

      ## If mol_name has conformer number appended on it, remove _NUM
      name = name.split('_')[0]

      mol = Top_sdf.get(name)
      if mol is None:
        print(' --> Molecule not found: {0} <--'.format(name))
        continue

      ## Rename mol name property to include data (ZINC, Rank, Score, Software)
      mol.SetProp('_Name',
          '{0}::{1}::{2:.1f}::{3}'.format(name, idx+1, float(score), dock))

      ## A molecule is excluded once, no matter how many patterns it matches
      if any(mol.HasSubstructMatch(patt) for patt in patterns):
        Exclude.append(mol)
        continue

      OUT.write(name+"\t"+str(score)+"\n")
      Select.append(mol)

      ## Stop when reached the Max. output number
      if len(Select) == all_top:
        break

  ## Write the results after the loop so that they are always flushed and
  ## closed, even when fewer than all_top molecules passed the filter.
  w = Chem.SDWriter(top_name+'.smt-clean.sdf')
  try:
    for mol in Select:
      w.write(mol)
    w.flush()
  finally:
    w.close()

  print("\n ## Total Molecule Looked Thru: "+str(looked))
  print(' ## Molecule Selected: '+str(len(Select)))
  ## Report the pattern that was actually applied (the parameter) rather
  ## than a name that is not defined inside this function.
  print(' ## Molecule Matched {0}: {1}'.format(arg_pat, len(Exclude)))
  gc.collect()

  if grid is True: grid_print(top_name, Select, 'sdf')
def WriteSDFData(Top_List, Top_sdf, all_top, top_name, dock):
    """Write the top-ranked molecules to a consolidated SDF and a text list.

    Walks ``Top_List`` in order and looks every entry up in ``Top_sdf``.
    Collection stops once ``all_top`` molecules were found or the list is
    exhausted.  The molecules are written to ``<top_name>.sdf`` and their
    name/score pairs to ``<top_name>.txt``.

    Side effect: the ``_Name`` property of every molecule found is rewritten
    to ``name::rank::score::dock`` (rank is the 1-based position in
    ``Top_List``).

    Args:
        Top_List: ranked sequence of items; item[0] is the score and item[1]
            the molecule name.
        Top_sdf: mapping of molecule name -> molecule object.
        all_top: number of molecules at which collection stops.
        top_name: prefix of the two output files.
        dock: label appended as the last field of the molecule title.

    Note:
        ``grid`` and ``grid_print`` are not defined in this function; they
        are resolved from the enclosing module.
    """
    Select = []
    looked = 0

    ## Use the Ranked list to rebuild a consolidated SDF
    with open(top_name + '.txt', 'w') as OUT:
        for idx, Item in enumerate(Top_List):
            looked = idx + 1
            score, name = Item[0], Item[1]

            ## NOTE: the name is used as-is here; stripping of a trailing
            ## '_NUM' conformer suffix is disabled in this variant.
            mol = Top_sdf.get(name)
            if mol is None:
                print(' --> Molecule not found: {0} <--'.format(name))
                continue

            ## Rename mol name property to include data
            ## (ZINC, Rank, Score, Software)
            mol.SetProp(
                '_Name', '{0}::{1}::{2:.1f}::{3}'.format(name, idx + 1,
                                                         float(score), dock))

            OUT.write('{0}\t{1}\n'.format(name, score))
            Select.append(mol)

            ## Stop when reached the Max. output number
            if len(Select) == all_top:
                break

    ## Write the results after the loop so that they are always flushed and
    ## closed, even when fewer than all_top molecules were found.
    w = Chem.SDWriter(top_name + '.sdf')
    try:
        for mol in Select:
            w.write(mol)
        w.flush()
    finally:
        w.close()

    print("\n ## Total Molecule Looked Thru: " + str(looked))
    print(' ## Total Molecule Output: ' + str(len(Select)))
    gc.collect()

    if grid is True: grid_print(top_name, Select, 'sdf')
예제 #5
0
def main(list_name, Chemicals, option):
    """Extract the ligands named in a list file from SDF input and save them.

    Reads ligand IDs from the first whitespace-separated column of
    ``list_name`` ('#' starts a comment), collects the matching molecules
    from ``rdkit_open(Chemicals)``, optionally sorts them, writes them to
    ``<list_name without '.txt'>.sdf`` and passes them to ``grid_print``.

    Molecule titles of the form ``name::rank::score::x`` are split into their
    fields; any other title is used as the name with rank 0 and score 0.0.

    Args:
        list_name: path of the file holding the selected ligand IDs.
        Chemicals: structure input handed unchanged to ``rdkit_open``.
        option: None for list order; 'name' / 'rank' / 'score' to sort by
            that field; any other string is taken as the name of an SDF tag
            whose value is converted to float and used as the sort key.
    """
    ## Read in the list of selected ligand ID.  IDs are kept as strings so
    ## that purely numeric IDs still match the (string) SDF titles.
    df = pd.read_csv(list_name, delimiter=r'\s+', header=None,
                     comment='#', dtype=str).dropna()
    List = df.loc[:, 0].to_numpy()
    print('\n > Number of items in <{}>: {}\n'.format(list_name, len(List)))

    ## Index the supplied SDF by ligand name
    print(' > List of structure file(s) read: \n', Chemicals)
    sdf = dict()
    for m in rdkit_open(Chemicals):
        title = m.GetProp('_Name')
        if '::' in title:
            name, rank, score, x = title.split('::')
            sdf[name] = [m, name, rank, score]
        else:
            sdf[title] = [m, title, 0, 0.0]

    ## Extract the selected ligands; report IDs absent from the SDF instead
    ## of aborting with a bare KeyError
    Molecules = []
    for chem in List:
        if chem in sdf:
            Molecules.append(sdf[chem])
        else:
            print(' --> Molecule not found: {0} <--'.format(chem))

    ## Sort data, if needed
    if option is not None:
        if option == 'name': Molecules.sort(key=lambda tup: tup[1])
        elif option == 'rank': Molecules.sort(key=lambda tup: int(tup[2]))
        elif option == 'score': Molecules.sort(key=lambda tup: float(tup[3]))
        else:
            print(
                ' ## Using SDF tag to sort ligand order: \033[31m{0}\033[0m\n'.
                format(option))
            Molecules.sort(key=lambda tup: float(tup[0].GetProp(option)))

    Mols = [mol[0] for mol in Molecules]
    out_prefix = list_name.split('.txt')[0]
    out = Chem.SDWriter(out_prefix + '.sdf')
    try:
        for molecule in Mols:
            out.write(molecule)
        out.flush()
    finally:
        ## Always release the writer, even if a molecule fails to write
        out.close()

    grid_print(out_prefix, Mols, 'sdf')
예제 #6
0
def main(filename):
    """Read one structure file and hand its molecules to ``grid_print``.

    ``filename`` is expanded with ``glob`` and only the first match is used.
    A name containing '.sdf' is read through ``file_handle`` and
    ``Chem.ForwardSDMolSupplier``; a name containing '.smi' is read with
    ``Chem.SmilesMolSupplier`` unless it ends in '.bz2' or '.gz', in which
    case only an informational message is printed.  Molecules that the
    supplier returns as None are dropped.

    Args:
        filename: file name or glob pattern of the structure file.

    Raises:
        IndexError: if ``filename`` matches no file.
    """
    mol_file = glob.glob(filename)[0]
    print('\n > File read: {}\n'.format(mol_file))

    ## The dot is escaped so that only a literal '.sdf' / '.smi' extension is
    ## recognised (an unescaped '.' matches any character).
    if re.search(r'\.sdf', mol_file):
        handle = file_handle(mol_file)

        Mol = [
            x for x in Chem.ForwardSDMolSupplier(handle, removeHs=True)
            if x is not None
        ]
        grid_print(mol_file.split('.sdf')[0], Mol, 'sdf')

    if re.search(r'\.smi', mol_file):
        if re.search(r'\.bz2$|\.gz$', mol_file):
            print(
                '\n  ## INFO: RDKit cannot take SMILES in zipped format, only ASCII\n'
            )
        else:
            with open(mol_file, 'r') as fi:
                first_line = fi.readline()

            ## A first line mentioning 'smiles' is treated as a title line
            has_title = bool(re.search(r'smiles', first_line, re.IGNORECASE))
            Mol = [
                x for x in Chem.SmilesMolSupplier(
                    mol_file, titleLine=has_title, delimiter=' |\t|,')
                if x is not None
            ]

            grid_print(mol_file.split('.smi')[0], Mol, 'smi')
def GenClustTable( Mol_List, output_name, column=5 ):
  """Draw every molecule of every cluster and emit a formatted grid.

  For each molecule the 'Name', 'Rank', 'Score' and 'Type' properties are
  read, the 'Cluster' (1-based index of its group in ``Mol_List``) and
  'SMILES' properties are set, and a 2D depiction is saved as
  ``_TEMP.<Name>.svg`` in the current directory.  One row
  ``[img_link, Name, Rank, Score, Type]`` is collected per molecule, rows
  are grouped per cluster, and the result is passed to ``grid_print`` with
  the 'formatted' mode.

  Args:
    Mol_List: sequence of clusters, each a sequence of molecule objects.
    output_name: output name forwarded to ``grid_print``.
    column: forwarded to ``grid_print`` as its ``column`` keyword
      (default 5).
  """
  Img_Data = []
  for idx, Mols in enumerate(Mol_List):
    Img = []

    for mol in Mols:
      # Get molecule info
      m1 = mol.GetProp('Name')
      m2 = mol.GetProp('Rank')
      m3 = mol.GetProp('Score')
      m4 = mol.GetProp('Type')

      # Tag the original molecule with its cluster number and SMILES
      # NOTE(review): the SMILES is generated before
      # AssignStereochemistryFrom3D is called - confirm this order is intended
      mol.SetProp('Cluster', str(idx+1))
      mol.SetProp('SMILES' , Chem.MolToSmiles(mol, isomericSmiles=True))
      AssignStereochemistryFrom3D(mol)

      # Create figure from a hydrogen-stripped copy with fresh 2D coordinates
      svg_name = '_TEMP.'+m1+'.svg'
      mol = rdMolDraw2D.PrepareMolForDrawing(mol)
      mol = Chem.RemoveHs(mol)
      AllChem.Compute2DCoords(mol)
      DrawingOptions.atomLabelFontSize=18

      Draw.MolToFile(mol, svg_name, size=(225,225) )
      img_link = '<img src="'+svg_name+'">'
      # Img = (image_link, Name, Rank, Score, Type)
      Img.append([img_link, m1, m2, m3, m4])

    Img_Data.append(Img)

  ## Print out a HTML page; every major cluster of compounds is grouped
  ## together, listing the Name of the compound, then the Rank and Score.
  ## The caller's 'column' value is honoured instead of a hard-coded 5.
  grid_print(output_name, Img_Data, 'formatted', column=column)
예제 #8
0
def main():
    """Pick library compounds similar to the query compounds and save them.

    Command-line arguments (``sys.argv[1:6]``):
        1. comma-separated query structure file(s)
        2. comma-separated library structure file(s)
        3. output prefix
        4. cutoff (converted to float)
        5. fingerprint choice

    The selection returned by ``pick_similar_cmpd`` is sent to ``grid_print``
    in 'formatted' mode; the saved (name, mol) pairs are de-duplicated by
    name (last occurrence wins) and written to ``<prefix>.smi``.
    """
    argv = sys.argv
    query_files = argv[1].split(',')
    library_files = argv[2].split(',')
    prefix = argv[3]
    threshold = float(argv[4])
    fingerprint = argv[5]

    ## Load both sets, then fingerprint them with the same method
    query_mols = rdkit_open(query_files)
    library_mols = rdkit_open(library_files)
    query_fp = calculate_FP(query_mols, fingerprint)
    library_fp = calculate_FP(library_mols, fingerprint)

    picked, saved = pick_similar_cmpd(query_fp, library_fp, threshold,
                                      fingerprint)
    grid_print(prefix, picked, 'formatted')

    ## Keep only the last record for every compound name
    table = pd.DataFrame(saved, columns=['name', 'mol'])
    unique = table.drop_duplicates(subset='name', keep='last')

    writer = Chem.SmilesWriter(prefix + '.smi')
    for entry in unique['mol']:
        writer.write(entry)
    writer.close()
예제 #9
0
        m = Cluster[0]
        i = m.GetProp("_Name").split()[0]
        a = 'TEMP.' + i + '.svg'
        l = '<img src="' + a + '">'
        h = Chem.RemoveHs(m)
        mol = h
        rdMolDraw2D.PrepareMolForDrawing(mol)
        AllChem.Compute2DCoords(mol)
        Draw.MolToFile(m, a, size=(200, 200))

        Single_list.append([l, i, "-", "-", '-'])  #(Mol, Name, Rank, Score)
    else:
        clust_list = []
        for m in Cluster:
            i = m.GetProp("_Name").split()[0]
            a = 'TEMP.' + i + '.svg'
            l = '<img src="' + a + '">'
            h = Chem.RemoveHs(m)
            mol = h
            rdMolDraw2D.PrepareMolForDrawing(mol)
            AllChem.Compute2DCoords(mol)
            Draw.MolToFile(m, a, size=(200, 200))

            ## [mol_data, Name, Rank, Score, Type]
            clust_list.append([l, i, "-", '-', '-'])
        Multi_list.append(clust_list)

## Add Single_list as the last entry of Multi_list
Multi_list.append(Single_list)

## Pass all collected entries to grid_print in 'formatted' mode; the first
## command-line argument is given as its first argument
grid_print(sys.argv[1], Multi_list, 'formatted')
예제 #10
0
def make_sdf(SDF_Names, All_Data, all_top, dock, prefix):
    """Build a consolidated SDF of the top-scoring molecules.

    Takes the first ``2 * all_top`` entries of ``All_Data`` as candidates,
    collects the matching molecules from the files in ``SDF_Names`` and
    writes up to ``all_top`` of them, in ranked order, to
    ``<prefix>.<dock>_top<N>.sdf`` together with a ``.txt`` list of
    name/score pairs.  ``<N>`` is ``all_top``, or ``all_top // 1000``
    followed by 'k' when ``all_top`` is 1000 or more.

    Side effect: the ``_Name`` property of every written molecule is set to
    ``name::rank::score::dock`` (rank is the 1-based position in the
    candidate list; a trailing ``_NUM`` part of the name is removed).

    Args:
        SDF_Names: iterable of SDF file names, each opened with
            ``file_handle``.
        All_Data: ranked iterable of items; item[0] is the score and
            item[1] the molecule name.
        all_top: maximum number of molecules to output.
        dock: label used in the output file name and molecule title.
        prefix: prefix of the output file names.

    Note:
        ``grid``, ``grid_print``, ``file_handle`` and ``RenameSDF`` are not
        defined in this function; they are resolved from the enclosing
        module.
    """
    from rdkit import Chem

    ## Build a Top-Selection list, with a 2x head-room for failed molecules
    print("  ## User-defined output total: " + str(all_top))
    Top_Hash = {}
    Top_List = []  # [(Score, Name), ...]
    for rank, List in enumerate(All_Data):
        Top_Hash[List[1]] = List[0]
        Top_List.append(List)
        if rank == (all_top * 2) - 1: break

    ## Build a library of molecules found in the Top-Selection List
    Top_sdf = {}
    for sdf_file in SDF_Names:
        print("  # Reading SDF file: " + sdf_file)
        sdf_handle = file_handle(sdf_file)
        Temp_sdf = [
            x for x in Chem.ForwardSDMolSupplier(sdf_handle, removeHs=False)
            if x is not None
        ]
        print("  # SDF mol read in from > " + sdf_file + " <: " +
              str(len(Temp_sdf)))

        ## Rename ligand name if previously processed with '::' tag
        ## (guarded so that an SDF without readable molecules is skipped)
        if Temp_sdf and re.search(r'::', Temp_sdf[0].GetProp('_Name')):
            print('  # Remove "::" tag from ligand name #')
            Temp_sdf = RenameSDF(Temp_sdf)

        prev_name = ''
        for idx, mol in enumerate(Temp_sdf):
            if idx % 10000 == 0:
                print(" Mol compared {0}".format(idx))

            ## RDKit may not handle the molecules and make a 'NoneType' item
            ## 'Could not sanitize molecule ending'. Ignore this molecule
            try:
                name = mol.GetProp('_Name')
            except AttributeError:
                print("A molecule failed after this molecule ID: " + prev_name)
                continue
            prev_name = name

            ## Membership test, so a candidate whose score is 0 is kept too
            key = name.strip()
            if key in Top_Hash:
                Top_sdf[key] = mol
        del Temp_sdf  # Free memory

    ## Use the Ranked list to rebuild a consolidated SDF
    if all_top >= 1000:
        ## Floor division keeps the label an integer ('top1k', not 'top1.0k')
        top_name = prefix + '.' + dock + '_top' + str(all_top // 1000) + 'k'
    else:
        top_name = prefix + '.' + dock + '_top' + str(all_top)

    SDF = []
    w = Chem.SDWriter(top_name + '.sdf')
    try:
        with open(top_name + '.txt', 'w') as OUT:
            for idx, Item in enumerate(Top_List):
                score = Item[0]
                name = Item[1]
                mol = Top_sdf.get(name)
                if mol is None:
                    print(" --> Molecule {0} is not found <--".format(name))
                    continue

                ## If the FRED mol_name has conformer number appended on it,
                ## remove _NUM
                name = name.split('_')[0]

                ## (ZINC, Rank, Score, Software)
                mol.SetProp(
                    '_Name', name + '::' + str(idx + 1) + '::' +
                    str("%.1f" % float(score)) + '::' + dock)
                w.write(mol)
                OUT.write(name + "\t" + str(score) + "\n")
                SDF.append(mol)

                ## Stop when reached the Max. output number; counting the
                ## written molecules (not the list index) lets the head-room
                ## entries replace molecules that were not found
                if len(SDF) == all_top:
                    break
        w.flush()
    finally:
        ## Always close the writer, also when fewer than all_top were found
        w.close()

    print("\n ## Total Molecule Ouptut: " + str(len(SDF)))
    gc.collect()

    if grid is True: grid_print(top_name, SDF, 'sdf')