def search_cocrystals(filter_solvents=True):
    """Search the whole CSD for structures made of two different molecules.

    Every structure read from the CSD is tested against a fixed set of
    search settings (organic only, not polymeric, has 3D coordinates, no
    disorder, no errors, no ions, no metals).  A structure is kept when its
    SMILES string, split on ``'.'`` and passed through ``Remove``, leaves
    exactly two items.  The surviving identifiers are then passed through
    ``remove_polymorphs``.

    Side effects: prints the elapsed time and writes the identifiers and
    chemical names to ``new_all_cocrystals.csv``.  Rows with a missing value,
    or whose name contains "solvate" or "clathrate", are left out of the CSV
    only; they are still part of the return value.

    Args:
        filter_solvents: When true, drop every structure that has at least
            one component whose SMILES is in ``clean_smiles.SOLVENT_SMILES``.

    Returns:
        The identifiers returned by ``remove_polymorphs``.
    """
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    start_time = time.perf_counter()

    csd = MoleculeReader('CSD')
    entry_reader = EntryReader('CSD')

    settings = search.Search.Settings()
    settings.only_organic = True
    settings.not_polymeric = True
    settings.has_3d_coordinates = True
    settings.no_disorder = True
    settings.no_errors = True
    settings.no_ions = True
    settings.no_metals = True

    pairs = []
    for entry in csd:
        if not settings.test(entry):
            continue
        mol = csd.molecule(entry.identifier)
        mol.normalise_labels()
        smi = mol.smiles
        if smi is None:
            continue
        # Remove() is defined elsewhere; presumably it drops duplicates, so
        # two remaining fragments means two *different* molecules -- confirm.
        if len(Remove(smi.split('.'))) == 2:
            pairs.append(mol.identifier)

    # Clean the list from solvents.
    if filter_solvents:
        print('Solvates and hydrates will be removed')
        solvates = set()
        for identifier in pairs:
            mol = csd.molecule(identifier)
            if any(component.smiles in clean_smiles.SOLVENT_SMILES
                   for component in mol.components):
                solvates.add(mol.identifier)
        final_cocrystals = [x for x in pairs if x not in solvates]
    else:
        final_cocrystals = pairs

    # Clean the list from polymorphs.
    cocrystals = remove_polymorphs(final_cocrystals)

    # Look the names up here, for the final identifiers only, so that they
    # are available whether or not the solvent filter ran (they used to be
    # collected inside the filter branch, which made filter_solvents=False
    # fail with a NameError).
    names = [entry_reader.entry(identifier).chemical_name
             for identifier in cocrystals]
    end_time = time.perf_counter()

    cocrystals_data = pd.concat(
        [pd.DataFrame(cocrystals, columns=['csd_id']),
         pd.DataFrame(names, columns=['name'])],
        axis=1)
    # Entries without a chemical name cannot be screened by name below.
    cocrystals_data = cocrystals_data.dropna(axis=0)
    dataset_cocrystals = cocrystals_data[
        ~cocrystals_data['name'].str.contains("solvate")]
    dataset_cocrystals = dataset_cocrystals[
        ~dataset_cocrystals['name'].str.contains("clathrate")]

    print(end_time - start_time)
    dataset_cocrystals.to_csv('new_all_cocrystals.csv', index=False)
    return cocrystals
def get_entry(identifier, database="CSD"):
    """Return the ccdc.entry.Entry object for an identifier string.

    Args:
        identifier: Identifier of the entry to fetch, as a string.
        database: Database handed to ``EntryReader``; defaults to "CSD".

    Returns:
        Whatever ``EntryReader(database).entry(identifier)`` yields.
    """
    reader = EntryReader(database)
    return reader.entry(identifier)
# Script section: write one diagram image per entry listed in ``df.refcode``.
from ccdc.search import TextNumericSearch
from ccdc.diagram import DiagramGenerator
from ccdc.io import EntryReader

entries = list(df.refcode)
data = []

# Disabled DOI lookup, kept for reference (the prints are Python 2 syntax):
# for e in entries:
#     query = TextNumericSearch()
#     query.add_all_identifiers(e)
#     hits = query.search()
#     data.append(hits[0].entry.publication.doi)
# from pprint import pprint
#
# print len(data)
# print len(set(data))

# Diagram appearance used for every image written below.
diagram_generator = DiagramGenerator()
diagram_generator.settings.image_width = 500
diagram_generator.settings.image_height = 500
diagram_generator.settings.line_width = 1.6
diagram_generator.settings.font_size = 12

csd_reader = EntryReader('CSD')

# NOTE(review): collecting the entries in a set only removes duplicates if
# Entry objects hash/compare by value -- confirm.  Set iteration order is not
# guaranteed, so which structure ends up in which "hitN.png" can vary.
mols = {csd_reader.entry(refcode) for refcode in entries}
for index, entry in enumerate(mols):
    diagram_generator.image(entry).save("hit{}.png".format(index))