def get_feature(molgrp):
    """Load every mapped feature of a molecule as a dense array.

    Args:
        molgrp: h5py-like group of one molecule. It must provide
            ``grid_points/x``, ``grid_points/y``, ``grid_points/z`` and a
            ``mapped_features`` group whose sub-groups hold the features.

    Returns:
        dict: ``{feature name: array}``. Keys are the bare feature names
        (not prefixed by their feature group), so a name present in several
        groups keeps the value of the last group read.
    """
    # the grid shape is needed to rebuild dense arrays from sparse storage
    shape = tuple(len(molgrp['grid_points/' + axis]) for axis in 'xyz')

    data_dict = {}

    # loop through all the feature groups, then through their features
    for featgrp in molgrp['mapped_features'].values():
        for feat_name, subgrp in featgrp.items():

            # ``dataset[()]`` reads the whole dataset; it replaces the
            # ``dataset.value`` accessor that h5py removed in version 3.0
            if subgrp.attrs['sparse']:
                spg = sparse.FLANgrid(sparse=True,
                                      index=subgrp['index'][()],
                                      value=subgrp['value'][()],
                                      shape=shape)
                data_dict[feat_name] = spg.to_dense()
            else:
                data_dict[feat_name] = subgrp['value'][()]

    return data_dict
def hdf5_grid_data(self, dict_data, data_name):
    """Write one mapped feature family to the hdf5 file.

    Each entry of ``dict_data`` becomes a sub-group of
    ``<mol_basename>/mapped_features/<data_name>``. An existing sub-group
    with the same key is deleted and recreated.

    Args:
        dict_data (dict): feature values stored as a dict
        data_name (str): feature name
    """
    group_path = self.mol_basename + '/mapped_features/' + data_name
    feat_group = self.hdf5.require_group(group_path)

    # every dataset is written with the same compression settings
    storage = {'compression': 'gzip', 'compression_opts': 9}

    for key, value in dict_data.items():

        # replace only the sub-group of this element, never the whole family
        if key in feat_group:
            del feat_group[key]
        sub_feat_group = feat_group.create_group(key)

        if self.try_sparse:
            # let FLANgrid decide whether the grid is worth storing sparse
            t0 = time()
            spg = sparse.FLANgrid()
            spg.from_dense(value, beta=1E-2)
            if self.time:
                print(' SPG time %f ms' % ((time() - t0) * 1000))
            is_sparse = spg.sparse
            stored_value = spg.value
        else:
            is_sparse = False
            stored_value = value

        sub_feat_group.attrs['sparse'] = is_sparse
        # the type tag is 'sparse_matrix' whatever the actual storage is
        sub_feat_group.attrs['type'] = 'sparse_matrix'

        # the index dataset only exists for a sparse representation
        if is_sparse:
            sub_feat_group.create_dataset('index', data=spg.index, **storage)
        sub_feat_group.create_dataset('value', data=stored_value, **storage)
def _context_sparse(item, treeview, position):
    """Run the context-menu operation chosen for a mapped-feature item.

    Two operations are offered: 'Load Matrix' and 'Plot Histogram'. Results
    are sent out through ``treeview.emitDict``.

    Args:
        item: tree item; the code reads its ``name``, ``basename``,
            ``data_file`` and ``parent`` attributes.
        treeview: view that owns the item and exposes the ``emitDict`` signal.
        position: position forwarded to ``get_actions``.
    """
    # NOTE(review): this menu is never used below; presumably get_actions
    # builds its own menu - confirm before removing this line.
    menu = QtWidgets.QMenu()
    list_operations = ['Load Matrix', 'Plot Histogram']
    action, actions = get_actions(treeview, position, list_operations)

    # short name built from the item basename and the third path component
    name = item.basename + '_' + item.name.split('/')[2]

    if action == actions['Load Matrix']:

        subgrp = item.data_file[item.name]
        data_dict = {}

        # ``dataset[()]`` reads the whole dataset; it replaces the
        # ``dataset.value`` accessor that h5py removed in version 3.0
        if not subgrp.attrs['sparse']:
            # NOTE(review): the dense branch is keyed by the full hdf5 path
            # while the sparse branch uses ``name`` - kept as is, confirm
            # which key the receiver of emitDict expects.
            data_dict[item.name] = subgrp['value'][()]
        else:
            # the molecule group sits three levels above the feature item
            molgrp = item.data_file[item.parent.parent.parent.name]
            shape = tuple(len(molgrp['grid_points/' + axis][()])
                          for axis in 'xyz')
            spg = sparse.FLANgrid(sparse=True,
                                  index=subgrp['index'][()],
                                  value=subgrp['value'][()],
                                  shape=shape)
            data_dict[name] = spg.to_dense()

        treeview.emitDict.emit(data_dict)

    if action == actions['Plot Histogram']:

        # first send the raw values, then the command that plots them
        value = item.data_file[item.name]['value'][()]
        treeview.emitDict.emit({'value': value})

        cmd = "%matplotlib inline\nimport matplotlib.pyplot as plt\nplt.hist(value,25)\nplt.show()\n"
        treeview.emitDict.emit({'exec_cmd': cmd})
def visualize3Ddata(hdf5=None, mol_name=None, out=None):
    """Export the mapped features of one molecule for 3D visualization.

    The structure stored under ``<mol_name>/complex`` is written to
    ``complex.pdb`` in the output directory. Every feature group found in
    ``<mol_name>/mapped_features`` is then loaded (sparse features are
    converted to dense arrays) and handed to ``export_cube_files`` together
    with the grid points.

    Args:
        hdf5 (str): name of the hdf5 file containing the molecule
        mol_name (str): name of the molecule group in the hdf5 file
        out (str, optional): output directory. Defaults to ``mol_name``.
            It is created when it does not exist.

    Raises:
        FileNotFoundError: if the hdf5 file cannot be opened
        LookupError: if ``mol_name`` is not found in the hdf5 file
    """
    outdir = out
    if outdir is None:
        outdir = mol_name

    # the output directory is passed on with a trailing slash
    if outdir[-1] != '/':
        outdir = outdir + '/'

    if not os.path.isdir(outdir):
        os.mkdir(outdir)

    try:
        f5 = h5py.File(hdf5, 'r')
    except Exception as err:
        raise FileNotFoundError(
            'HDF5 file %s could not be opened' % hdf5) from err

    # the context manager closes the file even when the export fails
    with f5:

        try:
            molgrp = f5[mol_name]
        except Exception as err:
            raise LookupError('Molecule %s not found in %s' %
                              (mol_name, hdf5)) from err

        # create the pdb file
        # ``dataset[()]`` reads the whole dataset; it replaces the
        # ``dataset.value`` accessor that h5py removed in version 3.0
        sqldb = pdb2sql(molgrp['complex'][()])
        try:
            # outdir already ends with '/', no extra separator needed
            sqldb.exportpdb(outdir + 'complex.pdb')
        finally:
            sqldb.close()

        # get the grid
        grid = {axis: molgrp['grid_points/' + axis][()] for axis in 'xyz'}
        shape = (len(grid['x']), len(grid['y']), len(grid['z']))

        # loop through all the feature groups
        for data_name, featgrp in molgrp['mapped_features'].items():

            # create a dict of the feature {name : value}
            data_dict = {}
            for feat_name, subgrp in featgrp.items():
                if subgrp.attrs['sparse']:
                    spg = sparse.FLANgrid(sparse=True,
                                          index=subgrp['index'][()],
                                          value=subgrp['value'][()],
                                          shape=shape)
                    data_dict[feat_name] = spg.to_dense()
                else:
                    data_dict[feat_name] = subgrp['value'][()]

            # export the cube file
            export_cube_files(data_dict, data_name, grid, outdir)
def _extract_data(self):
    """Extract the data from the different maps.

    Reads the file ``self.fname`` and, for every molecule in it:

    - adds the mean and variance of each mapped feature to the matching
      ``NormParam`` in ``self.parameters['features'][type][name]``
      (features listed in ``self.skip_feature`` are ignored);
    - updates the ``MinMaxParam`` of each target in
      ``self.parameters['targets']`` (targets listed in
      ``self.skip_target`` are ignored).

    ``self.nmol`` is set to the number of molecules in the file.
    """
    # the context manager closes the file even if a molecule fails to load
    with h5py.File(self.fname, 'r') as f5:

        mol_names = list(f5.keys())
        self.nmol = len(mol_names)

        # loop over the molecules
        for mol in mol_names:

            # get the mapped features group
            data_group = f5.get(mol + '/mapped_features/')

            # loop over all the feature types
            for feat_types, feat_names in data_group.items():

                # if feature type not in param add
                type_params = self.parameters['features'].setdefault(
                    feat_types, {})

                # loop over all the feature
                for name in feat_names:

                    # we skip the target
                    if name in self.skip_feature:
                        continue

                    # create the param if it doesn't already exists
                    if name not in type_params:
                        type_params[name] = NormParam()

                    # load the matrix (densified when stored sparse)
                    feat_data = data_group[feat_types + '/' + name]
                    if feat_data.attrs['sparse']:
                        mat = sparse.FLANgrid(sparse=True,
                                              index=feat_data['index'][:],
                                              value=feat_data['value'][:],
                                              shape=self.shape).to_dense()
                    else:
                        mat = feat_data['value'][:]

                    # add the parameter (mean and var)
                    type_params[name].add(np.mean(mat), np.var(mat))

            # get the target groups
            target_group = f5.get(mol + '/targets')

            # loop over all the targets
            for tname, tval in target_group.items():

                # we skip the already computed target
                if tname in self.skip_target:
                    continue

                # create a new item if needed
                if tname not in self.parameters['targets']:
                    self.parameters['targets'][tname] = MinMaxParam()

                # update the value
                self.parameters['targets'][tname].update(tval[()])
def load_one_molecule(self, fname, mol=None):
    """Load the feature/target of a single molecule.

    The features are read in the order given by ``self.select_feature``
    (``{feature type: feature names}``); sparse features are converted to
    dense arrays of shape ``self.grid_shape``. The target is the dataset
    ``targets/<self.select_target>``.

    Args:
        fname (str): hdf5 file name
        mol (None or str, optional): name of the complex in the hdf5.
            Defaults to the first molecule of the file.

    Returns:
        np.array,float: features, targets

    Raises:
        KeyError: if a selected feature type or feature name is missing.
            The available names are printed before raising.
    """
    outtype = 'float32'

    # the context manager closes the file even when a feature is missing
    with h5py.File(fname, 'r') as fh5:

        if mol is None:
            mol = list(fh5.keys())[0]

        # get the mol
        mol_data = fh5.get(mol)

        # get the features
        feature = []
        for feat_type, feat_names in self.select_feature.items():

            # ``get`` returns None (it does not raise) for a missing group,
            # so the absence has to be tested explicitly
            feat_dict = mol_data.get('mapped_features/' + feat_type)
            if feat_dict is None:
                print('Feature type %s not found in file %s for molecule %s' %
                      (feat_type, fname, mol))
                print('Possible feature types are : ' + '\n\t'.join(
                    list(mol_data['mapped_features'].keys())))
                raise KeyError(feat_type)

            # loop through all the desired feat names
            for name in feat_names:

                # extract the group
                try:
                    data = feat_dict[name]
                except KeyError:
                    print(
                        'Feature %s not found in file %s for mol %s and feature type %s'
                        % (name, fname, mol, feat_type))
                    print('Possible feature are : ' + '\n\t'.join(
                        list(mol_data['mapped_features/' +
                                      feat_type].keys())))
                    # stop here: going on would reuse the previous feature
                    # (or an undefined one) in place of the missing one
                    raise

                # check its sparse attribute
                # if true get a FLAN
                # if false direct import
                if data.attrs['sparse']:
                    mat = sparse.FLANgrid(sparse=True,
                                          index=data['index'][:],
                                          value=data['value'][:],
                                          shape=self.grid_shape).to_dense()
                else:
                    mat = data['value'][:]

                # append to the list of features
                feature.append(mat)

        # get the target value
        target = mol_data.get('targets/' + self.select_target)[()]

    # make sure all the feature have exact same type
    # if they don't collate_fn in the creation of the minibatch will fail.
    # Note returning torch.FloatTensor makes each epoch twice longer ...
    return (np.array(feature).astype(outtype),
            np.array([target]).astype(outtype))