def build_ligand_properties(self):
    """Create, save and return a blank 'small molecule' LigandProperities.

    Every descriptor field is explicitly set to None; only the ligand type
    (looked up by the name 'small molecule') is filled in. An info message is
    logged to record that an empty placeholder was handed back.

    Returns:
        The saved, otherwise empty, LigandProperities instance.
    """
    empty_props = LigandProperities()
    empty_props.ligand_type = LigandType.objects.get(name='small molecule')

    # Blank out all descriptor fields on the placeholder record.
    blank_fields = (
        'smiles',
        'inchikey',
        'sequence',
        'mw',
        'rotatable_bonds',
        'hacc',
        'hdon',
        'logp',
    )
    for field_name in blank_fields:
        setattr(empty_props, field_name, None)

    empty_props.save()
    self.logger.info("Could not create ligand, empty is returned")
    return empty_props
def _next_numbered_name(base_name, existing_names):
    """Return ``base_name`` suffixed with one more than the highest ``_<n>`` in use."""
    prefix = base_name + "_"
    highest = 0
    for existing in existing_names:
        suffix = existing[len(prefix):] if existing.startswith(prefix) else ""
        # Names that merely share the prefix (no numeric suffix) are ignored
        # instead of being fed to int().
        if suffix.isdecimal():
            highest = max(highest, int(suffix))
    return prefix + str(highest + 1)


def _insert_smiles_ligand(ligand_id, ligand_name, lt):
    """Find or create a canonical ligand keyed on a SMILES string PubChem did not resolve."""
    try:
        return Ligand.objects.get(name=ligand_name, canonical=True,
                                  properities__smiles=ligand_id)
    except Ligand.DoesNotExist:
        pass

    try:
        # Same SMILES stored under a name that starts with the requested one.
        return Ligand.objects.get(name__startswith=ligand_name, canonical=True,
                                  properities__smiles=ligand_id)
    except Ligand.DoesNotExist:
        pass

    try:
        # Same name but no SMILES recorded yet: fill it in.
        l = Ligand.objects.get(name=ligand_name, canonical=True,
                               properities__smiles=None)
        l.properities.smiles = ligand_id
        l.properities.save()
        l.save()
        return l
    except Ligand.DoesNotExist:
        pass

    # Insert a new ligand, but first make sure the name is unique.
    if Ligand.objects.filter(name=ligand_name).exists():
        taken = [
            existing.name
            for existing in Ligand.objects.filter(
                name__startswith=ligand_name, canonical=True).order_by("pk")
        ]
        ligand_name = _next_numbered_name(ligand_name, taken)

    lp = LigandProperities()
    lp.smiles = ligand_id
    lp.ligand_type = lt
    lp.save()

    l = Ligand()
    l.name = ligand_name
    l.properities = lp
    l.canonical = True  # maybe false, but that would break stuff.
    l.ambigious_alias = False
    try:
        l.save()
    except IntegrityError:
        # Saving hit an integrity constraint; reuse the stored canonical record.
        l = Ligand.objects.get(name=ligand_name, canonical=True)
    return l


def _ligand_from_external_id(ligand_id, type_id, name):
    """Resolve a ligand by PubChem CID or SMILES, creating records when needed."""
    pubchem_lookup_value = 'cid' if type_id == 'PubChem CID' else 'smiles'

    try:
        web_resource = WebResource.objects.get(slug='pubchem')
    except Exception as err:
        # abort if the pubchem resource is not found
        raise Exception('PubChem resource not found, aborting!') from err

    ligand_name = name if name else False

    try:
        if ligand_name is False:
            # No name given: take the first canonical ligand linked to this id.
            l = None
            for ligand in Ligand.objects.filter(
                    canonical=True,
                    properities__web_links__web_resource=web_resource,
                    properities__web_links__index=ligand_id):
                l = ligand
                break
            if l is None:
                # Lets the ORM raise Ligand.DoesNotExist for the handler below.
                l = Ligand.objects.get(
                    canonical=True,
                    properities__web_links__web_resource=web_resource,
                    properities__web_links__index=ligand_id)
        else:
            # Canonical ligand with this name and this external id.
            l = Ligand.objects.get(
                name=ligand_name, canonical=True,
                properities__web_links__web_resource=web_resource,
                properities__web_links__index=ligand_id)
        return l
    except Ligand.DoesNotExist:
        pass

    try:
        # The id exists under a different (canonical) name: register an alias.
        l_canonical = Ligand.objects.get(
            properities__web_links__web_resource=web_resource,
            properities__web_links__index=ligand_id, canonical=True)
    except Ligand.DoesNotExist:
        l_canonical = None

    if l_canonical is not None:
        try:
            l, _ = Ligand.objects.get_or_create(
                properities=l_canonical.properities,
                name=ligand_name, canonical=False)
        except IntegrityError:
            l = Ligand.objects.get(properities=l_canonical.properities,
                                   name=ligand_name, canonical=False)
        return l

    # Unknown locally: fetch the ligand from PubChem.
    default_ligand_type = 'Small molecule'
    lt, _ = LigandType.objects.get_or_create(
        slug=slugify(default_ligand_type),
        defaults={'name': default_ligand_type})
    l = Ligand().load_from_pubchem(pubchem_lookup_value, ligand_id, lt,
                                   ligand_name)
    if l is None and type_id == 'SMILES':
        # Insert manually if SMILES and not found in PubChem.
        l = _insert_smiles_ligand(ligand_id, ligand_name, lt)
    return l


def _ligand_from_name(name):
    """Resolve a ligand by name only, creating a record when nothing matches."""
    # Canonical record with this exact name.
    if Ligand.objects.filter(name=name, canonical=True).exists():
        return Ligand.objects.get(name=name, canonical=True)

    # Alias that has only one canonical parent.
    if Ligand.objects.filter(name=name, canonical=False,
                             ambigious_alias=False).exists():
        return Ligand.objects.get(name=name, canonical=False,
                                  ambigious_alias=False)

    lp = LigandProperities()
    lp.save()
    l = Ligand()
    l.properities = lp

    if Ligand.objects.filter(name=name, canonical=False,
                             ambigious_alias=True).exists():
        # Alias with several canonical parents: must investigate, start with
        # an empty record.
        l.name = name
        l.canonical = False
        l.ambigious_alias = True
        l.save()
        l.load_by_name(name)
        return l

    # Neither a canonical nor an alias exists: create the records.
    l.name = str(name)
    l.canonical = True
    l.ambigious_alias = False
    try:
        l.save()
        l.load_by_name(str(name))
    except IntegrityError:
        l = Ligand.objects.get(name=str(name), canonical=True)
    return l


def get_or_make_ligand(ligand_id, type_id, name=None):
    """Return the Ligand for an external id or a name, creating it if needed.

    Args:
        ligand_id: PubChem CID or SMILES string; only used when ``type_id``
            is 'PubChem CID' or 'SMILES'.
        type_id: Kind of identifier held in ``ligand_id``.
        name: Optional ligand name. Used together with the identifier, or on
            its own when ``type_id`` is neither 'PubChem CID' nor 'SMILES'.

    Returns:
        A Ligand instance, or None when ``type_id`` is not one of the two
        supported kinds and no name is given. The PubChem path returns
        whatever ``Ligand.load_from_pubchem`` returns, which may be None.

    Raises:
        Exception: If the 'pubchem' WebResource cannot be fetched.
    """
    if type_id in ('PubChem CID', 'SMILES'):
        return _ligand_from_external_id(ligand_id, type_id, name)
    if name:
        return _ligand_from_name(name)
    return None
def create_mutant_data(self, filenames):
    """Import mutation experiments from files in the structure data directory.

    Excel files are read through ``self.loaddatafromexcel`` and
    ``self.analyse_rows``; YAML files are read directly and mapped onto the
    same row layout. Every row is resolved to a publication, a ligand, a
    reference ligand, a protein and a residue, and a ``MutationExperiment``
    is then fetched or created for it. Rows are skipped (and logged) when
    the publication cannot be saved or when the protein or residue is not
    in the database. Files with an unrecognised extension are logged and
    skipped.

    Args:
        filenames: Names of files inside ``self.structure_data_dir``. When
            empty or None, every entry of that directory is considered.

    Raises:
        Exception: If the 'pubchem' WebResource cannot be fetched while
            resolving a PubChem CID / SMILES ligand.
    """
    self.logger.info('CREATING MUTANT DATA')

    # what files should be parsed?
    if not filenames:
        filenames = os.listdir(self.structure_data_dir)

    # Proteins already reported as missing, so each is logged only once.
    missing_proteins = set()
    # Experiment types starting with one of these hold -log10 values.
    log_type_pattern = re.compile(
        "(" + ")|(".join(['pEC50', 'pIC50', 'pK']) + ")")

    for source_file in filenames:
        source_file_path = os.sep.join([self.structure_data_dir, source_file])
        if not os.path.isfile(source_file_path) or source_file[0] == '.':
            continue

        self.logger.info('Reading file {}'.format(source_file_path))

        if source_file[-4:] == 'xlsx' or source_file[-3:] == 'xls':
            rows = self.loaddatafromexcel(source_file_path)
            rows = self.analyse_rows(rows)
        elif source_file[-4:] == 'yaml':
            # NOTE(review): yaml.load without an explicit Loader can build
            # arbitrary objects; prefer yaml.safe_load unless these files
            # are fully trusted.
            with open(source_file_path, 'r') as yaml_file:
                yaml_rows = yaml.load(yaml_file)
            rows = []
            for r in yaml_rows:
                d = {
                    'reference': r['pubmed'],
                    'protein': r['entry_name'].replace("__", "_").lower(),
                    'mutation_pos': r['seq'],
                    'mutation_from': r['from_res'],
                    'mutation_to': r['to_res'],
                    'ligand_name': '',
                    'ligand_type': '',
                    'ligand_id': '',
                    'ligand_class': '',
                    'exp_type': '',
                    'exp_func': '',
                    'exp_wt_value': 0,
                    'exp_wt_unit': '',
                    'exp_mu_effect_sign': '',
                    'exp_mu_value_raw': 0,
                    'fold_effect': 0,
                    'exp_mu_effect_qual': '',
                    'exp_mu_effect_ligand_prop': '',
                    'exp_mu_ligand_ref': '',
                    'opt_type': '',
                    'opt_wt': 0,
                    'opt_mu': 0,
                    'opt_sign': '',
                    'opt_percentage': 0,
                    'opt_qual': '',
                    'opt_agonist': '',
                }
                # if something is off with amino acid
                if len(d['mutation_to']) > 1 or len(d['mutation_from']) > 1:
                    continue
                rows.append(d)
        else:
            self.logger.info('unknown format {}'.format(source_file))
            continue

        c = 0
        skipped = 0
        for r in rows:
            c += 1
            if c % 1000 == 0:
                self.logger.info('Parsed ' + str(c) + ' mutant data entries')

            # publication
            try:  # fix if it thinks it's float.
                float(r['reference'])
                r['reference'] = str(int(r['reference']))
            except ValueError:
                pass

            if r['reference'].isdigit():  # assume pubmed
                pub_type = 'pubmed'
            else:  # assume doi
                pub_type = 'doi'

            try:
                pub = Publication.objects.get(
                    web_link__index=r['reference'],
                    web_link__web_resource__slug=pub_type)
            except Publication.DoesNotExist:
                pub = Publication()
                try:
                    pub.web_link = WebLink.objects.get(
                        index=r['reference'], web_resource__slug=pub_type)
                except WebLink.DoesNotExist:
                    wl = WebLink.objects.create(
                        index=r['reference'],
                        web_resource=WebResource.objects.get(slug=pub_type))
                    pub.web_link = wl

                if pub_type == 'doi':
                    pub.update_from_doi(doi=r['reference'])
                elif pub_type == 'pubmed':
                    pub.update_from_pubmed_data(index=r['reference'])
                try:
                    pub.save()
                except Exception:
                    self.logger.error('error with reference '
                                      + str(r['reference']) + ' ' + pub_type)
                    continue  # if something off with publication, skip.

            # ligand
            if r['ligand_type'] == 'PubChem CID' or r['ligand_type'] == 'SMILES':
                if r['ligand_type'] == 'PubChem CID':
                    pubchem_lookup_value = 'cid'
                else:
                    pubchem_lookup_value = 'smiles'

                try:
                    web_resource = WebResource.objects.get(slug='pubchem')
                except Exception as err:
                    # abort if the pubchem resource is not found
                    raise Exception(
                        'PubChem resource not found, aborting!') from err

                if 'ligand_name' in r and r['ligand_name']:
                    ligand_name = str(r['ligand_name'])
                else:
                    ligand_name = False

                try:
                    # if this name is canonical and it has a ligand record already
                    l = Ligand.objects.get(
                        name=ligand_name, canonical=True,
                        properities__web_links__web_resource=web_resource,
                        properities__web_links__index=r['ligand_id'])
                except Ligand.DoesNotExist:
                    try:
                        # if exists under different name
                        l_canonical = Ligand.objects.get(
                            properities__web_links__web_resource=web_resource,
                            properities__web_links__index=r['ligand_id'],
                            canonical=True)
                        l, created = Ligand.objects.get_or_create(
                            properities=l_canonical.properities,
                            name=ligand_name, canonical=False)
                        if created:
                            self.logger.info(
                                'Created ligand {}'.format(l.name))
                    except Ligand.DoesNotExist:
                        # fetch ligand from pubchem
                        default_ligand_type = 'Small molecule'
                        lt, _ = LigandType.objects.get_or_create(
                            slug=slugify(default_ligand_type),
                            defaults={'name': default_ligand_type})
                        l = Ligand()
                        l = l.load_from_pubchem(pubchem_lookup_value,
                                                r['ligand_id'], lt,
                                                ligand_name)
                        if l is None and r['ligand_type'] == 'SMILES':
                            # insert manually if smiles and unfound in pubchem
                            try:
                                l = Ligand.objects.get(
                                    name=ligand_name, canonical=True,
                                    properities__smiles=r['ligand_id'])
                            except Ligand.DoesNotExist:
                                l = Ligand()
                                l.name = ligand_name
                                lp = LigandProperities()
                                lp.smiles = r['ligand_id']
                                lp.ligand_type = lt
                                lp.save()
                                l.properities = lp
                                # maybe false, but that would break stuff.
                                l.canonical = True
                                l.ambigious_alias = False
                                l.save()
                                self.logger.info(
                                    'Created Ligand {} manually'.format(l.name))
            elif r['ligand_name']:
                if Ligand.objects.filter(name=r['ligand_name'],
                                         canonical=True).exists():
                    # this name is canonical and has a ligand record already
                    l = Ligand.objects.get(name=r['ligand_name'],
                                           canonical=True)
                elif Ligand.objects.filter(name=r['ligand_name'],
                                           canonical=False,
                                           ambigious_alias=False).exists():
                    # alias that has only one canonical parent
                    l = Ligand.objects.get(name=r['ligand_name'],
                                           canonical=False,
                                           ambigious_alias=False)
                elif Ligand.objects.filter(name=r['ligand_name'],
                                           canonical=False,
                                           ambigious_alias=True).exists():
                    # alias with several canonical parents: must
                    # investigate, start with empty.
                    lp = LigandProperities()
                    lp.save()
                    l = Ligand()
                    l.properities = lp
                    l.name = r['ligand_name']
                    l.canonical = False
                    l.ambigious_alias = True
                    l.save()
                    l.load_by_name(r['ligand_name'])
                else:
                    # neither a canonical nor an alias exists: create the
                    # records.
                    lp = LigandProperities()
                    lp.save()
                    l = Ligand()
                    l.properities = lp
                    l.name = str(r['ligand_name'])
                    l.canonical = True
                    l.ambigious_alias = False
                    l.save()
                    l.load_by_name(str(r['ligand_name']))
            else:
                l = None

            # reference ligand
            if Ligand.objects.filter(name=r['exp_mu_ligand_ref'],
                                     canonical=True).exists():
                # this name is canonical and has a ligand record already
                l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                           canonical=True)
            elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'],
                                       canonical=False,
                                       ambigious_alias=False).exists():
                # alias that has only one canonical parent
                l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                           canonical=False,
                                           ambigious_alias=False)
            elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'],
                                       canonical=False,
                                       ambigious_alias=True).exists():
                # alias with several canonical parents: must investigate,
                # start with empty.
                lp = LigandProperities()
                lp.save()
                l_ref = Ligand()
                l_ref.properities = lp
                l_ref.name = r['exp_mu_ligand_ref']
                l_ref.canonical = False
                l_ref.ambigious_alias = True
                l_ref.save()
                l_ref.load_by_name(r['exp_mu_ligand_ref'])
                l_ref.save()
            elif r['exp_mu_ligand_ref']:
                # neither a canonical nor an alias exists: create the records.
                lp = LigandProperities()
                lp.save()
                l_ref = Ligand()
                l_ref.properities = lp
                l_ref.name = r['exp_mu_ligand_ref']
                l_ref.canonical = True
                l_ref.ambigious_alias = False
                l_ref.save()
                l_ref.load_by_name(r['exp_mu_ligand_ref'])
                l_ref.save()
            else:
                l_ref = None

            # protein
            protein = Protein.objects.filter(entry_name=r['protein'])
            if protein.exists():
                protein = protein.get()
            else:
                skipped += 1
                if r['protein'] not in missing_proteins:
                    missing_proteins.add(r['protein'])
                    self.logger.error(
                        'Skipped due to no protein ' + r['protein'])
                continue

            # residue; the wild-type amino acid must match as well
            res = Residue.objects.filter(
                protein_conformation__protein=protein,
                amino_acid=r['mutation_from'],
                sequence_number=r['mutation_pos'])  # FIXME MAKE AA CHECK
            if res.exists():
                res = res.get()
            else:
                self.logger.error(
                    'Skipped due to no residue or mismatch AA '
                    + r['protein'] + ' pos:' + str(r['mutation_pos'])
                    + ' AA:' + r['mutation_from'])
                skipped += 1
                continue

            if r['ligand_class']:
                # FIXME this should not be needed
                l_role, _ = LigandRole.objects.get_or_create(
                    name=r['ligand_class'],
                    defaults={'slug': slugify(r['ligand_class'])[:50]})
            else:
                l_role = None

            if r['exp_type']:
                exp_type_id, _ = MutationExperimentalType.objects.get_or_create(
                    type=r['exp_type'])
            else:
                exp_type_id = None

            if r['exp_func']:
                exp_func_id, _ = MutationFunc.objects.get_or_create(
                    func=r['exp_func'])
            else:
                exp_func_id = None

            if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']:
                exp_qual_id, _ = MutationQual.objects.get_or_create(
                    qual=r['exp_mu_effect_qual'],
                    prop=r['exp_mu_effect_ligand_prop'])
            else:
                exp_qual_id = None

            if (r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r['opt_sign']
                    or r['opt_percentage'] or r['opt_qual']
                    or r['opt_agonist']):
                exp_opt_id, _ = MutationOptional.objects.get_or_create(
                    type=r['opt_type'], wt=r['opt_wt'], mu=r['opt_mu'],
                    sign=r['opt_sign'], percentage=r['opt_percentage'],
                    qual=r['opt_qual'], agonist=r['opt_agonist'])
            else:
                exp_opt_id = None

            mutation, _ = Mutation.objects.get_or_create(
                amino_acid=r['mutation_to'], protein=protein, residue=res)

            # Fold change: mutant over wild type; ratios below 1 are stored
            # as the negated reciprocal.
            foldchange = 0
            if r['exp_wt_value'] != 0 and r['exp_mu_value_raw'] != 0:
                # fix for new format
                if log_type_pattern.match(r['exp_type']):
                    # -log values!
                    foldchange = round(
                        math.pow(10, -r['exp_mu_value_raw'])
                        / pow(10, -r['exp_wt_value']), 3)
                else:
                    foldchange = round(
                        r['exp_mu_value_raw'] / r['exp_wt_value'], 3)
                if foldchange < 1 and foldchange != 0:
                    foldchange = -round((1 / foldchange), 3)
            elif r['fold_effect'] != 0:
                foldchange = round(r['fold_effect'], 3)
                # Guard against a tiny fold_effect that rounds to zero.
                if foldchange < 1 and foldchange != 0:
                    foldchange = -round((1 / foldchange), 3)

            raw_experiment = self.insert_raw(r)
            MutationExperiment.objects.get_or_create(
                refs=pub,
                protein=protein,
                residue=res,
                ligand=l,
                ligand_role=l_role,
                ligand_ref=l_ref,
                raw=raw_experiment,
                optional=exp_opt_id,
                exp_type=exp_type_id,
                exp_func=exp_func_id,
                exp_qual=exp_qual_id,
                mutation=mutation,
                wt_value=r['exp_wt_value'],
                mu_value=r['exp_mu_value_raw'],
                mu_sign=r['exp_mu_effect_sign'],
                foldchange=foldchange,
            )

        self.logger.info('Parsed ' + str(c)
                         + ' mutant data entries. Skipped ' + str(skipped))

    self.logger.info('COMPLETED CREATING MUTANTS')
def _next_numbered_name(base_name, existing_names):
    """Return ``base_name`` suffixed with one more than the highest ``_<n>`` in use."""
    prefix = base_name + "_"
    highest = 0
    for existing in existing_names:
        suffix = existing[len(prefix):] if existing.startswith(prefix) else ""
        # Names that merely share the prefix (no numeric suffix) are ignored
        # instead of being fed to int().
        if suffix.isdecimal():
            highest = max(highest, int(suffix))
    return prefix + str(highest + 1)


def _insert_smiles_ligand(ligand_id, ligand_name, lt):
    """Find or create a canonical ligand keyed on a SMILES string PubChem did not resolve."""
    try:
        return Ligand.objects.get(name=ligand_name, canonical=True,
                                  properities__smiles=ligand_id)
    except Ligand.DoesNotExist:
        pass

    try:
        # Same SMILES stored under a name that starts with the requested one.
        return Ligand.objects.get(name__startswith=ligand_name, canonical=True,
                                  properities__smiles=ligand_id)
    except Ligand.DoesNotExist:
        pass

    try:
        # Same name but no SMILES recorded yet: fill it in.
        l = Ligand.objects.get(name=ligand_name, canonical=True,
                               properities__smiles=None)
        l.properities.smiles = ligand_id
        l.properities.save()
        l.save()
        return l
    except Ligand.DoesNotExist:
        pass

    # Insert a new ligand, but first make sure the name is unique.
    if Ligand.objects.filter(name=ligand_name).exists():
        taken = [
            existing.name
            for existing in Ligand.objects.filter(
                name__startswith=ligand_name, canonical=True).order_by("pk")
        ]
        ligand_name = _next_numbered_name(ligand_name, taken)

    lp = LigandProperities()
    lp.smiles = ligand_id
    lp.ligand_type = lt
    lp.save()

    l = Ligand()
    l.name = ligand_name
    l.properities = lp
    l.canonical = True  # maybe false, but that would break stuff.
    l.ambigious_alias = False
    try:
        l.save()
    except IntegrityError:
        # Saving hit an integrity constraint; reuse the stored canonical record.
        l = Ligand.objects.get(name=ligand_name, canonical=True)
    return l


def _ligand_from_external_id(ligand_id, type_id, name):
    """Resolve a ligand by PubChem CID or SMILES, creating records when needed."""
    pubchem_lookup_value = 'cid' if type_id == 'PubChem CID' else 'smiles'

    try:
        web_resource = WebResource.objects.get(slug='pubchem')
    except Exception as err:
        # abort if the pubchem resource is not found
        raise Exception('PubChem resource not found, aborting!') from err

    ligand_name = name if name else False

    try:
        if ligand_name is False:
            # No name given: take the first canonical ligand linked to this id.
            l = None
            for ligand in Ligand.objects.filter(
                    canonical=True,
                    properities__web_links__web_resource=web_resource,
                    properities__web_links__index=ligand_id):
                l = ligand
                break
            if l is None:
                # Lets the ORM raise Ligand.DoesNotExist for the handler below.
                l = Ligand.objects.get(
                    canonical=True,
                    properities__web_links__web_resource=web_resource,
                    properities__web_links__index=ligand_id)
        else:
            # Canonical ligand with this name and this external id.
            l = Ligand.objects.get(
                name=ligand_name, canonical=True,
                properities__web_links__web_resource=web_resource,
                properities__web_links__index=ligand_id)
        return l
    except Ligand.DoesNotExist:
        pass

    try:
        # The id exists under a different (canonical) name: register an alias.
        l_canonical = Ligand.objects.get(
            properities__web_links__web_resource=web_resource,
            properities__web_links__index=ligand_id, canonical=True)
    except Ligand.DoesNotExist:
        l_canonical = None

    if l_canonical is not None:
        try:
            l, _ = Ligand.objects.get_or_create(
                properities=l_canonical.properities,
                name=ligand_name, canonical=False)
        except IntegrityError:
            l = Ligand.objects.get(properities=l_canonical.properities,
                                   name=ligand_name, canonical=False)
        return l

    # Unknown locally: fetch the ligand from PubChem.
    default_ligand_type = 'Small molecule'
    lt, _ = LigandType.objects.get_or_create(
        slug=slugify(default_ligand_type),
        defaults={'name': default_ligand_type})
    l = Ligand().load_from_pubchem(pubchem_lookup_value, ligand_id, lt,
                                   ligand_name)
    if l is None and type_id == 'SMILES':
        # Insert manually if SMILES and not found in PubChem.
        l = _insert_smiles_ligand(ligand_id, ligand_name, lt)
    return l


def _ligand_from_name(name):
    """Resolve a ligand by name only, creating a record when nothing matches."""
    # Canonical record with this exact name.
    if Ligand.objects.filter(name=name, canonical=True).exists():
        return Ligand.objects.get(name=name, canonical=True)

    # Alias that has only one canonical parent.
    if Ligand.objects.filter(name=name, canonical=False,
                             ambigious_alias=False).exists():
        return Ligand.objects.get(name=name, canonical=False,
                                  ambigious_alias=False)

    lp = LigandProperities()
    lp.save()
    l = Ligand()
    l.properities = lp

    if Ligand.objects.filter(name=name, canonical=False,
                             ambigious_alias=True).exists():
        # Alias with several canonical parents: must investigate, start with
        # an empty record.
        l.name = name
        l.canonical = False
        l.ambigious_alias = True
        l.save()
        l.load_by_name(name)
        return l

    # Neither a canonical nor an alias exists: create the records.
    l.name = str(name)
    l.canonical = True
    l.ambigious_alias = False
    try:
        l.save()
        l.load_by_name(str(name))
    except IntegrityError:
        l = Ligand.objects.get(name=str(name), canonical=True)
    return l


def get_or_make_ligand(ligand_id, type_id, name=None):
    """Return the Ligand for an external id or a name, creating it if needed.

    Args:
        ligand_id: PubChem CID or SMILES string; only used when ``type_id``
            is 'PubChem CID' or 'SMILES'.
        type_id: Kind of identifier held in ``ligand_id``.
        name: Optional ligand name. Used together with the identifier, or on
            its own when ``type_id`` is neither 'PubChem CID' nor 'SMILES'.

    Returns:
        A Ligand instance, or None when ``type_id`` is not one of the two
        supported kinds and no name is given. The PubChem path returns
        whatever ``Ligand.load_from_pubchem`` returns, which may be None.

    Raises:
        Exception: If the 'pubchem' WebResource cannot be fetched.
    """
    if type_id in ('PubChem CID', 'SMILES'):
        return _ligand_from_external_id(ligand_id, type_id, name)
    if name:
        return _ligand_from_name(name)
    return None
def main_func(self, positions, iteration, count, lock):
    """Worker loop that loads entries of ``self.ligand_dump`` into the database.

    Each pass claims the next unprocessed index from the shared counter and
    makes sure a Ligand (with its LigandProperities, web links and vendor
    links) exists for that entry. Entries without a 'logp' key are skipped.

    Args:
        positions: Not used by this method.
        iteration: Not used by this method.
        count: Shared counter exposing a mutable integer ``value``
            (presumably a multiprocessing.Value -- confirm against caller).
        lock: Context manager guarding access to ``count``.
    """
    ligands = self.ligand_dump
    total = len(ligands)

    while True:
        # Check the bound and claim the index in one critical section, so a
        # worker cannot index past the end after another one took the last
        # entry.
        with lock:
            if count.value >= total:
                break
            entry = ligands[count.value]
            count.value += 1
            if count.value % 10000 == 0:
                print('{} Status {} out of {}'.format(
                    datetime.datetime.strftime(datetime.datetime.now(),
                                               '%Y-%m-%d %H:%M:%S'),
                    count.value, total))

        if 'logp' not in entry:
            # temp skip to only use "full" annotated ligands
            continue

        lp = LigandProperities.objects.filter(
            inchikey=entry['inchikey']).first()
        ligand = None

        if lp:
            # The inchikey is known; check whether this name is attached to it.
            ligand = Ligand.objects.filter(
                name=entry['name'], properities=lp).prefetch_related(
                    'properities__ligand_type', 'properities__web_links',
                    'properities__vendors').first()

        # When name and inchikey are both there, only the links below are
        # topped up.
        if not ligand:
            if lp:
                print(entry['name'],
                      'is there! (but not by name, only inchi')
            else:
                # No properties match by inchikey -- create them first.
                lt, _ = LigandType.objects.get_or_create(
                    slug=entry['ligand_type__slug'],
                    defaults={'name': entry['ligand_type__name']})
                lp = LigandProperities()
                lp.inchikey = entry['inchikey']
                lp.smiles = entry['smiles']
                lp.mw = entry['mw']
                lp.logp = entry['logp']
                lp.rotatable_bonds = entry['rotatable_bonds']
                lp.hacc = entry['hacc']
                lp.hdon = entry['hdon']
                lp.ligand_type = lt
                lp.save()

            ligand = Ligand()
            ligand.properities = lp
            ligand.name = entry['name']
            ligand.canonical = entry['canonical']
            ligand.ambigious_alias = entry['ambigious_alias']
            ligand.save()

        # create links - impossible to make duplicates so no need to check
        # if there already
        if ligand.properities.web_links.count() < len(entry['web_links']):
            for link in entry['web_links']:
                wr = WebResource.objects.get(slug=link['web_resource'])
                wl, _ = WebLink.objects.get_or_create(
                    index=link['index'], web_resource=wr)
                ligand.properities.web_links.add(wl)

        # create vendors - a link is only added when its sid is not stored yet
        if ligand.properities.vendors.count() < len(entry['vendors']):
            for link in entry['vendors']:
                lv = LigandVendors.objects.get(slug=link['vendor_slug'])
                if LigandVendorLink.objects.filter(sid=link['sid']).exists():
                    continue
                lvl = LigandVendorLink()
                lvl.sid = link['sid']
                lvl.vendor = lv
                lvl.lp = ligand.properities
                lvl.vendor_external_id = link['vendor_external_id']
                lvl.url = link['url']
                lvl.save()
def main_func(self, positions, iteration, count, lock):
    """Worker loop that loads entries of ``self.ligand_dump`` into the database.

    Each pass claims the next unprocessed index from the shared counter and
    makes sure a Ligand (with its LigandProperities, web links and vendor
    links) exists for that entry. Entries without a 'logp' key are skipped.

    Args:
        positions: Not used by this method.
        iteration: Not used by this method.
        count: Shared counter exposing a mutable integer ``value``
            (presumably a multiprocessing.Value -- confirm against caller).
        lock: Context manager guarding access to ``count``.
    """
    ligands = self.ligand_dump
    total = len(ligands)

    while True:
        # Check the bound and claim the index in one critical section, so a
        # worker cannot index past the end after another one took the last
        # entry.
        with lock:
            if count.value >= total:
                break
            entry = ligands[count.value]
            count.value += 1
            if count.value % 10000 == 0:
                print('{} Status {} out of {}'.format(
                    datetime.datetime.strftime(datetime.datetime.now(),
                                               '%Y-%m-%d %H:%M:%S'),
                    count.value, total))

        if 'logp' not in entry:
            # temp skip to only use "full" annotated ligands
            continue

        lp = LigandProperities.objects.filter(
            inchikey=entry['inchikey']).first()
        ligand = None

        if lp:
            # The inchikey is known; check whether this name is attached to it.
            ligand = Ligand.objects.filter(
                name=entry['name'], properities=lp).prefetch_related(
                    'properities__ligand_type', 'properities__web_links',
                    'properities__vendors').first()

        # When name and inchikey are both there, only the links below are
        # topped up.
        if not ligand:
            if lp:
                print(entry['name'],
                      'is there! (but not by name, only inchi')
            else:
                # No properties match by inchikey -- create them first.
                lt, _ = LigandType.objects.get_or_create(
                    slug=entry['ligand_type__slug'],
                    defaults={'name': entry['ligand_type__name']})
                lp = LigandProperities()
                lp.inchikey = entry['inchikey']
                lp.smiles = entry['smiles']
                lp.mw = entry['mw']
                lp.logp = entry['logp']
                lp.rotatable_bonds = entry['rotatable_bonds']
                lp.hacc = entry['hacc']
                lp.hdon = entry['hdon']
                lp.ligand_type = lt
                lp.save()

            ligand = Ligand()
            ligand.properities = lp
            ligand.name = entry['name']
            ligand.canonical = entry['canonical']
            ligand.ambigious_alias = entry['ambigious_alias']
            ligand.save()

        # create links - impossible to make duplicates so no need to check
        # if there already
        if ligand.properities.web_links.count() < len(entry['web_links']):
            for link in entry['web_links']:
                wr = WebResource.objects.get(slug=link['web_resource'])
                wl, _ = WebLink.objects.get_or_create(
                    index=link['index'], web_resource=wr)
                ligand.properities.web_links.add(wl)

        # create vendors - a link is only added when its sid is not stored yet
        if ligand.properities.vendors.count() < len(entry['vendors']):
            for link in entry['vendors']:
                lv = LigandVendors.objects.get(slug=link['vendor_slug'])
                if LigandVendorLink.objects.filter(sid=link['sid']).exists():
                    continue
                lvl = LigandVendorLink()
                lvl.sid = link['sid']
                lvl.vendor = lv
                lvl.lp = ligand.properities
                lvl.vendor_external_id = link['vendor_external_id']
                lvl.url = link['url']
                lvl.save()
def create_mutant_data(self, filenames):
    """Import mutation experiments from files in the structure data directory.

    Excel files are read through ``self.loaddatafromexcel`` and
    ``self.analyse_rows``; YAML files are read directly and mapped onto the
    same row layout. Every row is resolved to a publication, a ligand, a
    reference ligand, a protein and a residue, and a ``MutationExperiment``
    is then fetched or created for it. Rows are skipped (and logged) when
    the publication cannot be saved or when the protein or residue is not
    in the database. Files with an unrecognised extension are logged and
    skipped.

    Args:
        filenames: Names of files inside ``self.structure_data_dir``. When
            empty or None, every entry of that directory is considered.

    Raises:
        Exception: If the 'pubchem' WebResource cannot be fetched while
            resolving a PubChem CID / SMILES ligand.
    """
    self.logger.info('CREATING MUTANT DATA')

    # what files should be parsed?
    if not filenames:
        filenames = os.listdir(self.structure_data_dir)

    # Proteins already reported as missing, so each is logged only once.
    missing_proteins = set()
    # Experiment types starting with one of these hold -log10 values.
    log_type_pattern = re.compile(
        "(" + ")|(".join(['pEC50', 'pIC50', 'pK']) + ")")

    for source_file in filenames:
        source_file_path = os.sep.join([self.structure_data_dir, source_file])
        if not os.path.isfile(source_file_path) or source_file[0] == '.':
            continue

        self.logger.info('Reading file {}'.format(source_file_path))

        if source_file[-4:] == 'xlsx' or source_file[-3:] == 'xls':
            rows = self.loaddatafromexcel(source_file_path)
            rows = self.analyse_rows(rows)
        elif source_file[-4:] == 'yaml':
            # NOTE(review): yaml.load without an explicit Loader can build
            # arbitrary objects; prefer yaml.safe_load unless these files
            # are fully trusted.
            with open(source_file_path, 'r') as yaml_file:
                yaml_rows = yaml.load(yaml_file)
            rows = []
            for r in yaml_rows:
                d = {
                    'reference': r['pubmed'],
                    'protein': r['entry_name'].replace("__", "_").lower(),
                    'mutation_pos': r['seq'],
                    'mutation_from': r['from_res'],
                    'mutation_to': r['to_res'],
                    'ligand_name': '',
                    'ligand_type': '',
                    'ligand_id': '',
                    'ligand_class': '',
                    'exp_type': '',
                    'exp_func': '',
                    'exp_wt_value': 0,
                    'exp_wt_unit': '',
                    'exp_mu_effect_sign': '',
                    'exp_mu_value_raw': 0,
                    'fold_effect': 0,
                    'exp_mu_effect_qual': '',
                    'exp_mu_effect_ligand_prop': '',
                    'exp_mu_ligand_ref': '',
                    'opt_type': '',
                    'opt_wt': 0,
                    'opt_mu': 0,
                    'opt_sign': '',
                    'opt_percentage': 0,
                    'opt_qual': '',
                    'opt_agonist': '',
                }
                # if something is off with amino acid
                if len(d['mutation_to']) > 1 or len(d['mutation_from']) > 1:
                    continue
                rows.append(d)
        else:
            self.logger.info('unknown format {}'.format(source_file))
            continue

        c = 0
        skipped = 0
        for r in rows:
            c += 1
            if c % 1000 == 0:
                self.logger.info('Parsed ' + str(c) + ' mutant data entries')

            # publication
            try:  # fix if it thinks it's float.
                float(r['reference'])
                r['reference'] = str(int(r['reference']))
            except ValueError:
                pass

            if r['reference'].isdigit():  # assume pubmed
                pub_type = 'pubmed'
            else:  # assume doi
                pub_type = 'doi'

            try:
                pub = Publication.objects.get(
                    web_link__index=r['reference'],
                    web_link__web_resource__slug=pub_type)
            except Publication.DoesNotExist:
                pub = Publication()
                try:
                    pub.web_link = WebLink.objects.get(
                        index=r['reference'], web_resource__slug=pub_type)
                except WebLink.DoesNotExist:
                    wl = WebLink.objects.create(
                        index=r['reference'],
                        web_resource=WebResource.objects.get(slug=pub_type))
                    pub.web_link = wl

                if pub_type == 'doi':
                    pub.update_from_doi(doi=r['reference'])
                elif pub_type == 'pubmed':
                    pub.update_from_pubmed_data(index=r['reference'])
                try:
                    pub.save()
                except Exception:
                    self.logger.error('error with reference '
                                      + str(r['reference']) + ' ' + pub_type)
                    continue  # if something off with publication, skip.

            # ligand
            if r['ligand_type'] == 'PubChem CID' or r['ligand_type'] == 'SMILES':
                if r['ligand_type'] == 'PubChem CID':
                    pubchem_lookup_value = 'cid'
                else:
                    pubchem_lookup_value = 'smiles'

                try:
                    web_resource = WebResource.objects.get(slug='pubchem')
                except Exception as err:
                    # abort if the pubchem resource is not found
                    raise Exception(
                        'PubChem resource not found, aborting!') from err

                if 'ligand_name' in r and r['ligand_name']:
                    ligand_name = str(r['ligand_name'])
                else:
                    ligand_name = False

                try:
                    # if this name is canonical and it has a ligand record already
                    l = Ligand.objects.get(
                        name=ligand_name, canonical=True,
                        properities__web_links__web_resource=web_resource,
                        properities__web_links__index=r['ligand_id'])
                except Ligand.DoesNotExist:
                    try:
                        # if exists under different name
                        l_canonical = Ligand.objects.get(
                            properities__web_links__web_resource=web_resource,
                            properities__web_links__index=r['ligand_id'],
                            canonical=True)
                        l, created = Ligand.objects.get_or_create(
                            properities=l_canonical.properities,
                            name=ligand_name, canonical=False)
                        if created:
                            self.logger.info(
                                'Created ligand {}'.format(l.name))
                    except Ligand.DoesNotExist:
                        # fetch ligand from pubchem
                        default_ligand_type = 'Small molecule'
                        lt, _ = LigandType.objects.get_or_create(
                            slug=slugify(default_ligand_type),
                            defaults={'name': default_ligand_type})
                        l = Ligand()
                        l = l.load_from_pubchem(pubchem_lookup_value,
                                                r['ligand_id'], lt,
                                                ligand_name)
                        if l is None and r['ligand_type'] == 'SMILES':
                            # insert manually if smiles and unfound in pubchem
                            try:
                                l = Ligand.objects.get(
                                    name=ligand_name, canonical=True,
                                    properities__smiles=r['ligand_id'])
                            except Ligand.DoesNotExist:
                                l = Ligand()
                                l.name = ligand_name
                                lp = LigandProperities()
                                lp.smiles = r['ligand_id']
                                lp.ligand_type = lt
                                lp.save()
                                l.properities = lp
                                # maybe false, but that would break stuff.
                                l.canonical = True
                                l.ambigious_alias = False
                                l.save()
                                self.logger.info(
                                    'Created Ligand {} manually'.format(l.name))
            elif r['ligand_name']:
                if Ligand.objects.filter(name=r['ligand_name'],
                                         canonical=True).exists():
                    # this name is canonical and has a ligand record already
                    l = Ligand.objects.get(name=r['ligand_name'],
                                           canonical=True)
                elif Ligand.objects.filter(name=r['ligand_name'],
                                           canonical=False,
                                           ambigious_alias=False).exists():
                    # alias that has only one canonical parent
                    l = Ligand.objects.get(name=r['ligand_name'],
                                           canonical=False,
                                           ambigious_alias=False)
                elif Ligand.objects.filter(name=r['ligand_name'],
                                           canonical=False,
                                           ambigious_alias=True).exists():
                    # alias with several canonical parents: must
                    # investigate, start with empty.
                    lp = LigandProperities()
                    lp.save()
                    l = Ligand()
                    l.properities = lp
                    l.name = r['ligand_name']
                    l.canonical = False
                    l.ambigious_alias = True
                    l.save()
                    l.load_by_name(r['ligand_name'])
                else:
                    # neither a canonical nor an alias exists: create the
                    # records.
                    lp = LigandProperities()
                    lp.save()
                    l = Ligand()
                    l.properities = lp
                    l.name = str(r['ligand_name'])
                    l.canonical = True
                    l.ambigious_alias = False
                    l.save()
                    l.load_by_name(str(r['ligand_name']))
            else:
                l = None

            # reference ligand
            if Ligand.objects.filter(name=r['exp_mu_ligand_ref'],
                                     canonical=True).exists():
                # this name is canonical and has a ligand record already
                l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                           canonical=True)
            elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'],
                                       canonical=False,
                                       ambigious_alias=False).exists():
                # alias that has only one canonical parent
                l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                           canonical=False,
                                           ambigious_alias=False)
            elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'],
                                       canonical=False,
                                       ambigious_alias=True).exists():
                # alias with several canonical parents: must investigate,
                # start with empty.
                lp = LigandProperities()
                lp.save()
                l_ref = Ligand()
                l_ref.properities = lp
                l_ref.name = r['exp_mu_ligand_ref']
                l_ref.canonical = False
                l_ref.ambigious_alias = True
                l_ref.save()
                l_ref.load_by_name(r['exp_mu_ligand_ref'])
                l_ref.save()
            elif r['exp_mu_ligand_ref']:
                # neither a canonical nor an alias exists: create the records.
                lp = LigandProperities()
                lp.save()
                l_ref = Ligand()
                l_ref.properities = lp
                l_ref.name = r['exp_mu_ligand_ref']
                l_ref.canonical = True
                l_ref.ambigious_alias = False
                l_ref.save()
                l_ref.load_by_name(r['exp_mu_ligand_ref'])
                l_ref.save()
            else:
                l_ref = None

            # protein
            protein = Protein.objects.filter(entry_name=r['protein'])
            if protein.exists():
                protein = protein.get()
            else:
                skipped += 1
                if r['protein'] not in missing_proteins:
                    missing_proteins.add(r['protein'])
                    self.logger.error(
                        'Skipped due to no protein ' + r['protein'])
                continue

            # residue; the wild-type amino acid must match as well
            res = Residue.objects.filter(
                protein_conformation__protein=protein,
                amino_acid=r['mutation_from'],
                sequence_number=r['mutation_pos'])  # FIXME MAKE AA CHECK
            if res.exists():
                res = res.get()
            else:
                self.logger.error(
                    'Skipped due to no residue or mismatch AA '
                    + r['protein'] + ' pos:' + str(r['mutation_pos'])
                    + ' AA:' + r['mutation_from'])
                skipped += 1
                continue

            if r['ligand_class']:
                # FIXME this should not be needed
                l_role, _ = LigandRole.objects.get_or_create(
                    name=r['ligand_class'],
                    defaults={'slug': slugify(r['ligand_class'])[:50]})
            else:
                l_role = None

            if r['exp_type']:
                exp_type_id, _ = MutationExperimentalType.objects.get_or_create(
                    type=r['exp_type'])
            else:
                exp_type_id = None

            if r['exp_func']:
                exp_func_id, _ = MutationFunc.objects.get_or_create(
                    func=r['exp_func'])
            else:
                exp_func_id = None

            if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']:
                exp_qual_id, _ = MutationQual.objects.get_or_create(
                    qual=r['exp_mu_effect_qual'],
                    prop=r['exp_mu_effect_ligand_prop'])
            else:
                exp_qual_id = None

            if (r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r['opt_sign']
                    or r['opt_percentage'] or r['opt_qual']
                    or r['opt_agonist']):
                exp_opt_id, _ = MutationOptional.objects.get_or_create(
                    type=r['opt_type'], wt=r['opt_wt'], mu=r['opt_mu'],
                    sign=r['opt_sign'], percentage=r['opt_percentage'],
                    qual=r['opt_qual'], agonist=r['opt_agonist'])
            else:
                exp_opt_id = None

            mutation, _ = Mutation.objects.get_or_create(
                amino_acid=r['mutation_to'], protein=protein, residue=res)

            # Fold change: mutant over wild type; ratios below 1 are stored
            # as the negated reciprocal.
            foldchange = 0
            if r['exp_wt_value'] != 0 and r['exp_mu_value_raw'] != 0:
                # fix for new format
                if log_type_pattern.match(r['exp_type']):
                    # -log values!
                    foldchange = round(
                        math.pow(10, -r['exp_mu_value_raw'])
                        / pow(10, -r['exp_wt_value']), 3)
                else:
                    foldchange = round(
                        r['exp_mu_value_raw'] / r['exp_wt_value'], 3)
                if foldchange < 1 and foldchange != 0:
                    foldchange = -round((1 / foldchange), 3)
            elif r['fold_effect'] != 0:
                foldchange = round(r['fold_effect'], 3)
                # Guard against a tiny fold_effect that rounds to zero.
                if foldchange < 1 and foldchange != 0:
                    foldchange = -round((1 / foldchange), 3)

            raw_experiment = self.insert_raw(r)
            MutationExperiment.objects.get_or_create(
                refs=pub,
                protein=protein,
                residue=res,
                ligand=l,
                ligand_role=l_role,
                ligand_ref=l_ref,
                raw=raw_experiment,
                optional=exp_opt_id,
                exp_type=exp_type_id,
                exp_func=exp_func_id,
                exp_qual=exp_qual_id,
                mutation=mutation,
                wt_value=r['exp_wt_value'],
                mu_value=r['exp_mu_value_raw'],
                mu_sign=r['exp_mu_effect_sign'],
                foldchange=foldchange,
            )

        self.logger.info('Parsed ' + str(c)
                         + ' mutant data entries. Skipped ' + str(skipped))

    self.logger.info('COMPLETED CREATING MUTANTS')