def __init__(self, antigen, antibody):
    """Set up an empty 60-slot fingerprint for an antigen/antibody pair.

    Every key looked up on this mapping defaults to a fresh list of sixty
    zeros. Besides storing the two structures, the constructor prepares
    the neighbour tables, the residue-code -> property-id lookup and the
    distance-group cache used by the other methods.
    """
    OrderedDefaultDict.__init__(self, lambda: [0] * 60)

    self.antigen = antigen
    self.antibody = antibody

    # distance cutoff applied when deciding whether two residues are close
    self.atom_dist_cutoff = 4.0

    # caches
    self.nearby_relation = defaultdict(dict)
    self.dist_group_cache = defaultdict(dict)

    # surrounding residues for each residue: residue -> group index -> list
    self.nearby_reses_in_antigen = OrderedDefaultDict(
        lambda: defaultdict(list))
    self.nearby_reses_in_antibody = OrderedDefaultDict(
        lambda: defaultdict(list))

    # property id -> residue codes carrying that property
    polar = ['TYR', 'ASN', 'GLU', 'SER', 'CYS', 'THR', 'GLY']
    hydrop = ['PHE', 'LEU', 'ILE', 'TRP', 'VAL', 'MET', 'PRO', 'ALA']
    charged = ['ARG', 'ASP', 'GLU', 'LYS', 'HIS']
    lipids = ['ALA', 'VAL', 'LEU', 'ILE', 'MET', 'ASN', 'GLU', 'LYS',
              'ARG', 'GLY', 'SER', 'THR', 'CYS', 'ASP', 'PHE']
    aromatic = ['PHE', 'TYR', 'TRP']
    heterocyclic = ['PRO', 'HIS']
    self.fp_rule = {
        0: polar,
        1: hydrop,
        2: charged,
        3: lipids,
        4: aromatic,
        5: heterocyclic,
    }

    print("initializing FingerPrint_60 object")

    # Invert fp_rule once so that the property ids of a residue code can be
    # read with a single lookup later on.
    self.res_prop_ids = defaultdict(list)
    for prop_id in self.fp_rule:
        for res_code in self.fp_rule[prop_id]:
            self.res_prop_ids[res_code].append(prop_id)
def __init__(self, res, bitlength, values=None):
    """(Residue, int, dict or list) => BaseResidueFingerprint

    Create a fingerprint for ``res`` whose missing entries default to
    ``float()``. The index window starts at 0 and ends at ``bitlength``.

    ``values`` optionally supplies initial content, which is handed to
    ``set_val``: a dict is passed through unchanged, a list is first
    converted to an ordered ``index -> value`` mapping. Any other
    non-None type raises ValueError.
    """
    OrderedDefaultDict.__init__(self, float)

    self.res = res
    self.bitlength = bitlength
    self.min_idx = 0
    self.max_idx = self.min_idx + bitlength

    # nothing to preload
    if values is None:
        return

    if isinstance(values, dict):
        initial = values
    elif isinstance(values, list):
        # positions in the list become the keys
        initial = OrderedDict(enumerate(values))
    else:
        raise ValueError("invalid values type, either dict or list")

    self.set_val(initial)
def get_manual_groups(group_id="157"):
    """Return the manually classified complexes, grouped by type.

    Two text files under ``data_root/manual_classification_result`` are read
    in lockstep: ``<group_id>_pdbname.txt`` (one complex name per line) and
    ``<group_id>_type.txt`` (the type of the complex on the same line).
    Whitespace inside a name is replaced by underscores. A pair is kept only
    when both fields are non-empty and the name matches the base name
    (text before the first dot) of a file found by ``glob.glob(pdb_src)``.

    group_id -- prefix of the two classification files.

    Returns the values of an ordered ``type -> [names]`` mapping, i.e. one
    list of complex names per type, in order of first appearance.
    """
    print("generating manual classification and grouping")

    # A set makes the per-line membership test below O(1).
    pdb_names = set(
        os.path.basename(fname).split('.')[0].strip()
        for fname in glob.glob(pdb_src)
    )

    pdb_fp = os.path.join(
        data_root, 'manual_classification_result/%s_pdbname.txt' % group_id)
    type_fp = os.path.join(
        data_root, 'manual_classification_result/%s_type.txt' % group_id)

    class_d = OrderedDefaultDict(list)

    # Context managers guarantee both files are closed, even on error.
    with open(pdb_fp) as names_file, open(type_fp) as types_file:
        for name, c_type in zip(names_file, types_file):
            name = '_'.join(name.strip().split())
            c_type = c_type.strip()
            # skip empty lines and complexes without a structure file
            if name and c_type and name in pdb_names:
                class_d[c_type].append(name)

    return class_d.values()
class FingerPrint_60(OrderedDefaultDict):
    """Per-residue 60-slot fingerprint of an antigen/antibody complex.

    The object is a mapping ``residue -> list of 60 counters``; a missing
    key defaults to sixty zeros. ``get_fingerprint`` fills slots 0-29 from
    ``nearby_reses_in_antigen`` and slots 30-59 from
    ``nearby_reses_in_antibody``. Within each half a slot index is
    ``group_index * 6 + prop_id`` where ``prop_id`` is a key of ``fp_rule``.
    """

    def __init__(self, antigen, antibody):
        """Store both structures and prepare lookup tables and caches."""
        OrderedDefaultDict.__init__(self, lambda: [0] * 60)

        self.antigen = antigen
        self.antibody = antibody

        self.nearby_relation = defaultdict(dict)  # for cache

        # surrounding residues for each residue:
        # residue -> distance group index -> list of residues
        self.nearby_reses_in_antigen = OrderedDefaultDict(
            lambda: defaultdict(list))
        self.nearby_reses_in_antibody = OrderedDefaultDict(
            lambda: defaultdict(list))

        # the key is the property (group) index, the value the residue codes
        self.fp_rule = {
            0: ['TYR', 'ASN', 'GLU', 'SER', 'CYS', 'THR', 'GLY'],  # polar
            1: ['PHE', 'LEU', 'ILE', 'TRP', 'VAL', 'MET', 'PRO', 'ALA'],  # hydrop
            2: ['ARG', 'ASP', 'GLU', 'LYS', 'HIS'],  # charged
            3: ['ALA', 'VAL', 'LEU', 'ILE', 'MET', 'ASN', 'GLU', 'LYS',
                'ARG', 'GLY', 'SER', 'THR', 'CYS', 'ASP', 'PHE'],  # lipids
            4: ['PHE', 'TYR', 'TRP'],  # aromatic
            5: ['PRO', 'HIS'],  # heterocyclic
        }

        # the property ids that a given residue code has; fp_rule is
        # inverted once here so later lookups are a single dict access
        self.res_prop_ids = defaultdict(list)
        print("initializing FingerPrint_60 object")
        for prop_id, residues in self.fp_rule.items():
            for res_code in residues:
                self.res_prop_ids[res_code].append(prop_id)

        self.atom_dist_cutoff = 4.0
        self.dist_group_cache = defaultdict(dict)

    def _property_ids(self, residue):
        """Return the property ids registered for the residue's code."""
        return self.res_prop_ids[residue.pdbres.strip().upper()]

    def residue_nearby_enough(self, res1, res2):
        """Tell whether two residues are within ``atom_dist_cutoff``.

        The distance used is the one returned by ``_res_distance``.
        """
        return self._res_distance(res1, res2) <= self.atom_dist_cutoff

    def _res_distance(self, res1, res2):
        """Return the distance between the averaged atom coordinates
        of the two residues."""
        center1 = np.average([atom.xyz for atom in res1.atom], axis=0)
        center2 = np.average([atom.xyz for atom in res2.atom], axis=0)
        diff = np.matrix(center1 - center2)
        return np.sqrt((diff * diff.T).sum())

    def _get_dist_group(self, dist, bound_list=(4., 8., 12., 16., 20.)):
        """Return the index of the first upper bound that ``dist`` does not
        exceed, or -1 when it is larger than every bound."""
        for level, upper_bound in enumerate(bound_list):
            if dist <= upper_bound:
                return level
        # not in the surrounding
        return -1

    def _init_workers(self, w_count):
        """Start ``w_count`` GroupingWorker instances sharing one task
        queue, in preparation for parallel computing."""
        self.workers = []
        self.task_queue = Queue()
        for i in xrange(w_count):
            worker = GroupingWorker(self.task_queue)
            self.workers.append(worker)
            worker.start()

    def _is_dist_group_cached(self, res1, res2):
        """Check whether the distance group of the pair is already cached.

        The cache is keyed by residue numbers (``resnum``).
        """
        return res2.resnum in self.dist_group_cache[res1.resnum]

    def _get_dist_group_from_cache(self, res1, res2):
        """Return the cached distance group of the pair."""
        return self.dist_group_cache[res1.resnum][res2.resnum]

    def _cache_dist_group(self, res1, res2, dist_group):
        """Cache the distance group of the pair in both directions."""
        self.dist_group_cache[res1.resnum][res2.resnum] = dist_group
        self.dist_group_cache[res2.resnum][res1.resnum] = dist_group

    def grouping_residue_by_distance(self):
        """For every antigen residue, group its surrounding residues (from
        both antigen and antibody) by distance.

        Results are appended to ``nearby_reses_in_antigen``.

        NOTE(review): nothing in this method fills
        ``nearby_reses_in_antibody``, so slots 30-59 of the fingerprint stay
        zero unless that table is populated elsewhere - confirm intent.
        NOTE(review): a pair is only grouped when it passes
        ``residue_nearby_enough`` (cutoff 4.0), which equals the first
        default bound of ``_get_dist_group``; freshly computed pairs
        therefore always land in group 0 - confirm intent.
        """
        print("grouping antigen side,total count: %d"
              % (len(self.antigen.residue)))

        count = 0
        hit_count = 0
        miss_count = 0
        for res1 in self.antigen.residue:
            for res2 in chain(self.antigen.residue, self.antibody.residue):
                # Compare residue numbers by value: identity ("is") only
                # happens to work for small cached integers.
                if res1.resnum == res2.resnum:
                    continue

                if self._is_dist_group_cached(res1, res2):
                    # already computed for the symmetric pair, reuse it
                    print("hit")
                    dist_group = self._get_dist_group_from_cache(res1, res2)
                    print("%s %s" % (res1.resnum, res2.resnum))
                    self.nearby_reses_in_antigen[res1][dist_group].append(res2)
                    hit_count += 1
                else:
                    # new pair, start from scratch
                    if self.residue_nearby_enough(res1, res2):
                        dist = self._res_distance(res1, res2)
                        dist_group = self._get_dist_group(dist)
                        self.nearby_reses_in_antigen[res1][dist_group].append(res2)
                        self._cache_dist_group(res1, res2, dist_group)
                    miss_count += 1
            # progress: number of antigen residues processed so far
            count += 1
            print(count)
        print("%s %s" % (hit_count, miss_count))

    def get_fingerprint(self):
        """Compute the fingerprint on first use and return ``self``.

        When the mapping already holds entries it is returned unchanged, so
        repeated calls do not count the same neighbours twice.
        """
        if self:  # already computed
            return self

        print("grouping by distance")
        self.grouping_residue_by_distance()

        # the first 30 bits
        print("fisrt 30 bits started")
        for res, groups in self.nearby_reses_in_antigen.items():
            for group_index, residues in groups.items():
                for residue in residues:
                    for prop_id in self._property_ids(residue):
                        # increment the count of property at given position
                        self[res][group_index * 6 + prop_id] += 1

        # the 30 ~ 60 bits
        print("second 30 bits started")
        for res, groups in self.nearby_reses_in_antibody.items():
            for group_index, residues in groups.items():
                for residue in residues:
                    for prop_id in self._property_ids(residue):
                        # same as above, offset by 30
                        self[res][30 + group_index * 6 + prop_id] += 1

        return self

    def display_fingerprint(self, start=None, end=None):
        """Print a header line followed by one line of counters per residue.

        start, end -- optional slice limits applied to each counter list;
        a falsy value (None or 0) leaves that side of the slice open.
        """
        header = " ".join("%dp%d" % (g, p)
                          for g in xrange(5) for p in xrange(6))
        print("%s%s" % (' ' * 11, header))

        for residue, fp in self.items():
            fp = fp[(start or None):(end or None)]
            print("%8d : %s" % (residue.resnum,
                                ' '.join("%2d " % count for count in fp)))

    def display_group_info(self):
        """Print, for both neighbour tables, one line per residue listing
        each distance group as ``index:size(members)``; every member is
        shown as ``CODE(prop ids)``."""

        def _display_group_info(nearby_reses):
            for residue, groups in nearby_reses:
                # the whole line is assembled first and printed once
                fields = ["%8d" % residue.resnum]
                for group_index, residues in groups.items():
                    members = ' '.join(
                        "%s(%s)" % (
                            res.pdbres.strip().upper(),
                            ','.join('%d' % prop_id
                                     for prop_id in self._property_ids(res)),
                        )
                        for res in residues
                    )
                    fields.append("%d:%d(%s)"
                                  % (group_index, len(residues), members))
                print(' '.join(fields))

        print("antigen part(first 30 bit)")
        _display_group_info(self.nearby_reses_in_antigen.items())
        print("antibody part(30 ~ 60 bit)")
        _display_group_info(self.nearby_reses_in_antibody.items())