def calculate_unhashed_fps(self, draw_substructures=False, image_directory='./images_substructures'):
    """Compute unhashed (keyed) Morgan fingerprints for the molecules in ``self.mols``.

    For every molecule the Morgan fingerprint is generated up to
    ``self.max_radius`` and only the substructure identifiers whose first
    recorded environment has a radius contained in ``self.radii`` are kept.
    Two matrices with one row per molecule and one column per substructure
    identifier are then filled in: a binary presence matrix and a matrix
    holding the number of occurrences.

    The columns are the sorted entries of ``self.reference_substructure_keys``
    when that collection is non-empty; otherwise they are the sorted keys of
    ``self.substructure_dictionary`` after the substructures of the input
    molecules have been merged into it.

    Args:
        draw_substructures: If true, write one image per kept substructure
            that is not yet a key of ``self.substructure_dictionary``.
        image_directory: Directory receiving the images; created if missing.

    Returns:
        None. Results are stored on the instance:
        ``substructures_smiles`` (updated), ``substructure_dictionary``
        (updated: identifier -> list of molecule indices),
        ``substructure_ids``, ``columns_unhashed`` and, unless no identifier
        matches any column, ``fps_unhashed_binary`` / ``fps_unhashed_counts``.
    """
    # The three lists below are kept aligned: entry i describes one
    # (molecule, substructure) pair.
    pair_rows = []
    pair_ids = []
    pair_counts = []
    # Identifiers kept for each molecule, in molecule order.
    ids_per_molecule = []

    for mol_index, mol in enumerate(self.mols):
        info = {}
        # Only the bitInfo side effect is needed; the fingerprint itself is unused.
        _GetMorganFingerprint(mol, radius=self.max_radius, bitInfo=info)

        # Each value is a sequence of (atom index, radius) occurrences; the
        # first occurrence decides whether the identifier is kept.
        kept = [(k, v) for k, v in info.items() if v[0][1] in self.radii]
        ids_per_molecule.append([k for k, _ in kept])

        # Build ids and counts from the same iteration so they cannot get
        # out of step with each other.
        for k, v in kept:
            pair_rows.append(mol_index)
            pair_ids.append(k)
            pair_counts.append(len(v))

        # get the smiles for the substructures
        amap = {}
        substructures_smiles = {}
        for k, v in kept:
            env = _FindAtomEnvironmentOfRadiusN(mol, v[0][1], v[0][0])
            substructures_smiles[k] = [_MolToSmiles(_PathToSubmol(mol, env, atomMap=amap))]
        self.substructures_smiles.update(substructures_smiles)

        # generate the images for the substructures if required..
        if draw_substructures:
            if not _exists(image_directory):
                _makedirs(image_directory)
            for k, v in kept:
                if k in self.substructure_dictionary:
                    continue
                image_name = "%s/Molecule_%d_substr_%d.pdf" % (image_directory, mol_index, k)
                env = _FindAtomEnvironmentOfRadiusN(mol, v[0][1], v[0][0])
                amap = {}
                # Called for its side effect: it fills amap with the atoms of
                # the environment, which are then highlighted on the molecule.
                _PathToSubmol(mol, env, atomMap=amap)
                _MolToFile(mol, image_name, size=(300, 300), wedgeBonds=True,
                           kekulize=True, highlightAtoms=list(amap.keys()))

    # Record, for every substructure, the molecules in which it was found.
    for mol_index, kept_ids in enumerate(ids_per_molecule):
        for k in kept_ids:
            members = self.substructure_dictionary.setdefault(k, [])
            if mol_index not in members:
                members.append(mol_index)

    substr_ids = _array(pair_ids)
    self.substructure_ids = substr_ids

    # len() rather than truthiness: the reference keys may be an array.
    if len(self.reference_substructure_keys) == 0:
        print("No input set of keys for the substructures. \nThus, the substructures present in the input molecules will be considered for the calculation of unhashed fingerprints.")
        columns = _array(list(set(self.substructure_dictionary.keys())))
    else:
        columns = _array(self.reference_substructure_keys)
    columns = _sort(columns)
    self.columns_unhashed = columns
    dimensionality_unhashed = len(columns)

    fps_unhashed_binary = _zeros((len(self.mols), dimensionality_unhashed), dtype=int)
    fps_unhashed_counts = _zeros((len(self.mols), dimensionality_unhashed), dtype=int)

    # identifier -> column position (first occurrence wins if keys repeat).
    column_of = {}
    for position, key in enumerate(columns.tolist()):
        column_of.setdefault(key, position)

    # Keep only the pairs whose identifier is one of the columns. Rows,
    # columns and counts stay flat and of equal length, so the indexed
    # assignments below touch exactly one cell per pair.
    rows = []
    cols = []
    values = []
    for mol_index, key, count in zip(pair_rows, pair_ids, pair_counts):
        position = column_of.get(key)
        if position is not None:
            rows.append(mol_index)
            cols.append(position)
            values.append(count)

    if len(rows) == 0:
        print("There is no intersection between the substructures \n(i)provided in the reference key set, and\n(ii) the substructures found in the input molecules.")
        return

    rows = _array(rows)
    cols = _array(cols)
    fps_unhashed_binary[rows, cols] = 1
    fps_unhashed_counts[rows, cols] = _array(values)
    self.fps_unhashed_binary = fps_unhashed_binary
    self.fps_unhashed_counts = fps_unhashed_counts
def ones(*shp, dtype='float64'):
    """Call ``_ones`` with the dimensions given as separate arguments.

    The positional arguments are packed into a single tuple and passed to
    ``_ones`` as its first argument, so ``ones(2, 3)`` is shorthand for
    ``_ones((2, 3), dtype='float64')``.

    Args:
        *shp: Dimension sizes, one per positional argument.
        dtype: Forwarded unchanged to ``_ones``; keyword-only.

    Returns:
        Whatever ``_ones`` returns for that shape tuple and dtype.
    """
    shape = tuple(shp)
    result = _ones(shape, dtype=dtype)
    return result
def calculate_unhashed_fps(self, draw_substructures=False, image_directory='./images_substructures'):
    """Compute unhashed (keyed) Morgan fingerprints for the molecules in ``self.mols``.

    For every molecule the Morgan fingerprint is generated up to
    ``self.max_radius`` and only the substructure identifiers whose first
    recorded environment has a radius contained in ``self.radii`` are kept.
    Two matrices with one row per molecule and one column per substructure
    identifier are then filled in: a binary presence matrix and a matrix
    holding the number of occurrences.

    The columns are the sorted entries of ``self.reference_substructure_keys``
    when that collection is non-empty; otherwise they are the sorted keys of
    ``self.substructure_dictionary`` after the substructures of the input
    molecules have been merged into it via ``self._combine_dicts``.

    Args:
        draw_substructures: If true, write one image per kept substructure
            that is not yet a key of ``self.substructure_dictionary``.
        image_directory: Directory receiving the images; created if missing.

    Returns:
        None. Results are stored on the instance:
        ``substructures_smiles`` (updated), ``substructure_dictionary``
        (replaced by the merged dictionary), ``substructure_ids``,
        ``columns_unhashed`` and, unless no identifier matches any column,
        ``fps_unhashed_binary`` / ``fps_unhashed_counts``.
    """
    # The three lists below are kept aligned: entry i describes one
    # (molecule, substructure) pair.
    pair_rows = []
    pair_ids = []
    pair_counts = []

    for mol_index, mol in enumerate(self.mols):
        info = {}
        # Only the bitInfo side effect is needed; the fingerprint itself is unused.
        _GetMorganFingerprint(mol, radius=self.max_radius, bitInfo=info)

        # Each value is a sequence of (atom index, radius) occurrences; the
        # first occurrence decides whether the identifier is kept.
        kept = [(k, v) for k, v in info.items() if v[0][1] in self.radii]
        substructure_dictionary = {k: [mol_index] for k, _ in kept}

        # Build ids and counts from the same iteration so they cannot get
        # out of step with each other.
        for k, v in kept:
            pair_rows.append(mol_index)
            pair_ids.append(k)
            pair_counts.append(len(v))

        # get the smiles for the substructures
        amap = {}
        substructures_smiles = {}
        for k, v in kept:
            env = _FindAtomEnvironmentOfRadiusN(mol, v[0][1], v[0][0])
            substructures_smiles[k] = [_MolToSmiles(_PathToSubmol(mol, env, atomMap=amap))]
        self.substructures_smiles.update(substructures_smiles)

        # generate the images for the substructures if required..
        if draw_substructures:
            if not _exists(image_directory):
                _makedirs(image_directory)
            for k, v in kept:
                if k in self.substructure_dictionary:
                    continue
                image_name = "%s/Molecule_%d_substr_%d.pdf" % (image_directory, mol_index, k)
                env = _FindAtomEnvironmentOfRadiusN(mol, v[0][1], v[0][0])
                amap = {}
                # Called for its side effect: it fills amap with the atoms of
                # the environment, which are then highlighted on the molecule.
                _PathToSubmol(mol, env, atomMap=amap)
                _MolToFile(mol, image_name, size=(300, 300), wedgeBonds=True,
                           kekulize=True, highlightAtoms=list(amap.keys()))

        # Merge this molecule's substructures into the running dictionary
        # after drawing, so the "not yet known" check above sees only the
        # substructures of earlier molecules.
        self.substructure_dictionary = self._combine_dicts(
            substructure_dictionary, self.substructure_dictionary)

    substr_ids = _array(pair_ids)
    self.substructure_ids = substr_ids

    # len() rather than truthiness: the reference keys may be an array.
    if len(self.reference_substructure_keys) == 0:
        print("No input set of keys for the substructures. \nThus, the substructures present in the input molecules will be considered for the calculation of unhashed fingerprints.")
        columns = _array(list(set(self.substructure_dictionary.keys())))
    else:
        columns = _array(self.reference_substructure_keys)
    columns = _sort(columns)
    self.columns_unhashed = columns
    dimensionality_unhashed = len(columns)

    fps_unhashed_binary = _zeros((len(self.mols), dimensionality_unhashed), dtype=int)
    fps_unhashed_counts = _zeros((len(self.mols), dimensionality_unhashed), dtype=int)

    # identifier -> column position (first occurrence wins if keys repeat).
    column_of = {}
    for position, key in enumerate(columns.tolist()):
        column_of.setdefault(key, position)

    # Keep only the pairs whose identifier is one of the columns. Rows,
    # columns and counts stay flat and of equal length, so the indexed
    # assignments below touch exactly one cell per pair.
    rows = []
    cols = []
    values = []
    for mol_index, key, count in zip(pair_rows, pair_ids, pair_counts):
        position = column_of.get(key)
        if position is not None:
            rows.append(mol_index)
            cols.append(position)
            values.append(count)

    if len(rows) == 0:
        print("There is no intersection between the substructures \n(i)provided in the reference key set, and\n(ii) the substructures found in the input molecules.")
        return

    rows = _array(rows)
    cols = _array(cols)
    fps_unhashed_binary[rows, cols] = 1
    fps_unhashed_counts[rows, cols] = _array(values)
    self.fps_unhashed_binary = fps_unhashed_binary
    self.fps_unhashed_counts = fps_unhashed_counts
def ones(*shp, dtype="float64"):
    """Forward separately-given dimensions to ``_ones`` as one shape tuple.

    ``ones(2, 3)`` is equivalent to ``_ones((2, 3), dtype="float64")``; the
    positional arguments are collected into a tuple and passed as the first
    argument of ``_ones``.

    Args:
        *shp: Dimension sizes, one per positional argument.
        dtype: Keyword-only value handed to ``_ones`` unchanged.

    Returns:
        The value produced by ``_ones`` for that shape tuple and dtype.
    """
    dimensions = tuple(shp)
    filled = _ones(dimensions, dtype=dtype)
    return filled