def main_func(self, positions, iteration):
    """Import one slice of ``self.data`` as mutation experiments.

    For every row the publication, optional review, ligand, reference
    ligand, protein, residue and the experiment lookup records are resolved
    (and created when missing).  A row is skipped when its publication or
    review cannot be saved, when its protein is unknown, or when no residue
    matches the position and wild-type amino acid.  The collected
    ``MutationRaw`` and ``MutationExperiment`` objects are written with two
    ``bulk_create`` calls after all rows have been processed.

    Args:
        positions: two-item sequence ``(start, stop)`` indexing ``self.data``;
            a falsy ``stop`` means "until the end".
        iteration: not used by this method.
    """

    def resolve_publication(index, pub_type, label):
        """Return the Publication for ``index`` or None if it cannot be saved.

        ``label`` ('reference' or 'review') is only used in the error log.
        """
        if index in self.publication_cache:
            return self.publication_cache[index]
        try:
            publication = Publication.objects.get(
                web_link__index=index,
                web_link__web_resource__slug=pub_type)
        except Publication.DoesNotExist:
            publication = Publication()
            try:
                publication.web_link = WebLink.objects.get(
                    index=index, web_resource__slug=pub_type)
            except WebLink.DoesNotExist:
                publication.web_link = WebLink.objects.create(
                    index=index,
                    web_resource=WebResource.objects.get(slug=pub_type))
            if pub_type == 'doi':
                publication.update_from_doi(doi=index)
            elif pub_type == 'pubmed':
                publication.update_from_pubmed_data(index=index)
            try:
                publication.save()
            except Exception:
                # If something is off with the publication, the row is skipped.
                self.logger.error('error with ' + label + ' ' + str(index)
                                  + ' ' + pub_type)
                return None
        self.publication_cache[index] = publication
        return publication

    if not positions[1]:
        rows = self.data[positions[0]:]
    else:
        rows = self.data[positions[0]:positions[1]]

    missing_proteins = {}
    mutants_for_proteins = {}
    c = 0
    skipped = 0
    bulk_m = []
    bulk_r = []
    current_sheet = time.time()

    # Experiment types whose values are -log10 concentrations.
    logtypes = ['pEC50', 'pIC50', 'pK']
    log_type_pattern = re.compile("(" + ")|(".join(logtypes) + ")")

    for r in rows:
        c += 1

        # Numeric ids may arrive as floats; turn them into integer strings.
        # Each field is handled on its own so that a non-numeric reference
        # (e.g. a DOI) does not prevent the review from being normalised.
        for field in ('reference', 'review'):
            try:
                float(r[field])
                r[field] = str(int(r[field]))
            except ValueError:
                pass

        # Digits only -> assume PubMed id, anything else -> assume DOI.
        pub_type = 'pubmed' if r['reference'].isdigit() else 'doi'
        pub = resolve_publication(r['reference'], pub_type, 'reference')
        if pub is None:
            continue

        if r['review']:
            pub_type = 'pubmed' if r['review'].isdigit() else 'doi'
            pub_review = resolve_publication(r['review'], pub_type, 'review')
            if pub_review is None:
                continue
        else:
            pub_review = None

        # Ligand, cached per (name, id).
        ligand = None
        if str(r['ligand_name']) in self.ligand_cache:
            if r['ligand_id'] in self.ligand_cache[str(r['ligand_name'])]:
                ligand = self.ligand_cache[str(
                    r['ligand_name'])][r['ligand_id']]
        else:
            self.ligand_cache[str(r['ligand_name'])] = {}
        if not ligand:
            ligand = get_or_make_ligand(r['ligand_id'], r['ligand_type'],
                                        str(r['ligand_name']))
            self.ligand_cache[str(r['ligand_name'])][r['ligand_id']] = ligand

        # Reference ligand, cached per name.
        ref_name = r['exp_mu_ligand_ref']
        if str(ref_name) in self.ref_ligand_cache:
            l_ref = self.ref_ligand_cache[str(ref_name)]
        else:
            if Ligand.objects.filter(name=ref_name, canonical=True).exists():
                # The name is canonical and already has a ligand record.
                l_ref = Ligand.objects.get(name=ref_name, canonical=True)
            elif Ligand.objects.filter(name=ref_name, canonical=False,
                                       ambigious_alias=False).exists():
                # Alias with a single canonical parent.
                l_ref = Ligand.objects.get(name=ref_name, canonical=False,
                                           ambigious_alias=False)
            elif Ligand.objects.filter(name=ref_name, canonical=False,
                                       ambigious_alias=True).exists():
                # Alias with several canonical parents: start with an empty
                # record that has to be investigated.
                lp = LigandProperities()
                lp.save()
                l_ref = Ligand()
                l_ref.properities = lp
                l_ref.name = ref_name
                l_ref.canonical = False
                l_ref.ambigious_alias = True
                l_ref.save()
                l_ref.load_by_name(ref_name)
                l_ref.save()
            elif Ligand.objects.filter(name=ref_name,
                                       canonical=False).exists():
                # Alias whose ambigious_alias flag is not specified.
                l_ref = Ligand.objects.get(name=ref_name, canonical=False)
                l_ref.ambigious_alias = False
                l_ref.save()
            elif ref_name:
                # Neither a canonical nor an alias exists: create the records.
                lp = LigandProperities()
                lp.save()
                l_ref = Ligand()
                l_ref.properities = lp
                l_ref.name = ref_name
                l_ref.canonical = True
                l_ref.ambigious_alias = False
                l_ref.save()
                l_ref.load_by_name(ref_name)
                try:
                    l_ref.save()
                except IntegrityError:
                    l_ref = Ligand.objects.get(name=ref_name, canonical=False)
                    print("error failing ligand, duplicate?")
            else:
                l_ref = None
            self.ref_ligand_cache[str(ref_name)] = l_ref

        # Protein: skip the row when the entry name is unknown.
        protein = Protein.objects.filter(entry_name=r['protein'])
        if protein.exists():
            protein = protein.get()
            mutants_for_proteins[r['protein']] = (
                mutants_for_proteins.get(r['protein'], 0) + 1)
        else:
            skipped += 1
            missing_proteins[r['protein']] = (
                missing_proteins.get(r['protein'], 0) + 1)
            self.logger.error('Skipped due to no protein ' + r['protein'])
            continue

        # Residue must match both the position and the wild-type amino acid.
        res = Residue.objects.filter(
            protein_conformation__protein=protein,
            amino_acid=r['mutation_from'],
            sequence_number=r['mutation_pos'])  # FIXME MAKE AA CHECK
        if res.exists():
            res = res.get()
        else:
            self.logger.error('Skipped due to no residue or mismatch AA '
                              + r['protein'] + ' pos:'
                              + str(r['mutation_pos']) + ' AA:'
                              + r['mutation_from'])
            skipped += 1
            continue

        if r['ligand_class']:
            l_role, created = LigandRole.objects.get_or_create(
                name=r['ligand_class'],
                defaults={'slug': slugify(r['ligand_class'])[:50]
                          })  # FIXME this should not be needed
        else:
            l_role = None

        if r['exp_type']:
            exp_type_id, created = MutationExperimentalType.objects.get_or_create(
                type=r['exp_type'])
        else:
            exp_type_id = None

        if r['exp_func']:
            exp_func_id, created = MutationFunc.objects.get_or_create(
                func=r['exp_func'])
        else:
            exp_func_id = None

        if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']:
            exp_qual_id, created = MutationQual.objects.get_or_create(
                qual=r['exp_mu_effect_qual'],
                prop=r['exp_mu_effect_ligand_prop'])
        else:
            exp_qual_id = None

        if (r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r['opt_sign']
                or r['opt_percentage'] or r['opt_qual'] or r['opt_agonist']):
            exp_opt_id, created = MutationOptional.objects.get_or_create(
                type=r['opt_type'],
                wt=r['opt_wt'],
                mu=r['opt_mu'],
                sign=r['opt_sign'],
                percentage=r['opt_percentage'],
                qual=r['opt_qual'],
                agonist=r['opt_agonist'])
        else:
            exp_opt_id = None

        try:
            mutation, created = Mutation.objects.get_or_create(
                amino_acid=r['mutation_to'], protein=protein, residue=res)
        except IntegrityError:
            mutation = Mutation.objects.get(amino_acid=r['mutation_to'],
                                            protein=protein,
                                            residue=res)

        # Fold change: ratios below 1 are stored as the negative inverse.
        foldchange = 0
        if r['exp_wt_value'] != 0 and r['exp_mu_value_raw'] != 0:
            if log_type_pattern.match(r['exp_type']):
                # -log values
                foldchange = round(
                    math.pow(10, -r['exp_mu_value_raw'])
                    / pow(10, -r['exp_wt_value']), 3)
            else:
                foldchange = round(
                    r['exp_mu_value_raw'] / r['exp_wt_value'], 3)
            if foldchange < 1 and foldchange != 0:
                foldchange = -round((1 / foldchange), 3)
        elif r['fold_effect'] != 0:
            foldchange = round(r['fold_effect'], 3)
            # Rounding can yield 0; guard it like the branch above so the
            # inversion cannot raise ZeroDivisionError.
            if foldchange < 1 and foldchange != 0:
                foldchange = -round((1 / foldchange), 3)

        raw_experiment = self.insert_raw(r)
        bulk = MutationExperiment(
            refs=pub,
            review=pub_review,
            protein=protein,
            residue=res,
            ligand=ligand,
            ligand_role=l_role,
            ligand_ref=l_ref,
            optional=exp_opt_id,
            exp_type=exp_type_id,
            exp_func=exp_func_id,
            exp_qual=exp_qual_id,
            mutation=mutation,
            wt_value=r['exp_wt_value'],
            mu_value=r['exp_mu_value_raw'],
            mu_sign=r['exp_mu_effect_sign'],
            foldchange=foldchange)
        bulk_r.append(raw_experiment)
        bulk_m.append(bulk)

    self.logger.info('Parsed ' + str(c) + ' mutant data entries. Skipped '
                     + str(skipped))

    # Write the raw rows first so each experiment can point at its raw row.
    current = time.time()
    raws = MutationRaw.objects.bulk_create(bulk_r)
    for experiment, raw in zip(bulk_m, raws):
        experiment.raw = raw
    MutationExperiment.objects.bulk_create(bulk_m)
    end = time.time()
    diff = round(end - current, 2)
    diff_2 = round(end - current_sheet, 2)
    print("overall", diff_2, "bulk", diff, len(bulk_m), "skipped",
          str(skipped))
def get_or_make_ligand(ligand_id, type_id, name=None):
    """Return the ``Ligand`` for an identifier and/or name, creating it if needed.

    When ``type_id`` is ``'PubChem CID'`` or ``'SMILES'`` the ligand is looked
    up through its PubChem web link, then as an alias of an existing canonical
    ligand, and finally loaded via ``Ligand.load_from_pubchem``.  A SMILES that
    PubChem cannot resolve is inserted manually.  For any other ``type_id`` the
    ligand is resolved by ``name`` alone.

    Args:
        ligand_id: PubChem CID or SMILES string, depending on ``type_id``.
        type_id: ``'PubChem CID'``, ``'SMILES'`` or anything else for
            name-only lookup.
        name: optional ligand name.

    Returns:
        A ``Ligand``, or ``None`` when neither a supported ``type_id`` nor a
        name is given (or when ``load_from_pubchem`` returns ``None`` for a
        CID).

    Raises:
        Exception: if the ``pubchem`` web resource cannot be fetched.
    """
    if type_id == 'PubChem CID' or type_id == 'SMILES':
        if type_id == 'PubChem CID':
            pubchem_lookup_value = 'cid'
        elif type_id == 'SMILES':
            pubchem_lookup_value = 'smiles'

        try:
            web_resource = WebResource.objects.get(slug='pubchem')
        except Exception as err:
            # Abort if the PubChem resource is not found; keep the cause.
            raise Exception('PubChem resource not found, aborting!') from err

        if name:
            ligand_name = name
        else:
            ligand_name = False

        try:
            # Canonical ligand that already carries this PubChem link.
            if ligand_name is False:
                ligand = None
                candidates = Ligand.objects.filter(
                    canonical=True,
                    properities__web_links__web_resource=web_resource,
                    properities__web_links__index=ligand_id)
                for candidate in candidates:
                    ligand = candidate
                    break
                if ligand is None:
                    # Nothing matched: this get() raises DoesNotExist and
                    # hands over to the fallbacks below.
                    ligand = Ligand.objects.get(
                        canonical=True,
                        properities__web_links__web_resource=web_resource,
                        properities__web_links__index=ligand_id)
            else:
                ligand = Ligand.objects.get(
                    name=ligand_name,
                    canonical=True,
                    properities__web_links__web_resource=web_resource,
                    properities__web_links__index=ligand_id)
        except Ligand.DoesNotExist:
            try:
                # The compound exists under a different (canonical) name:
                # register this name as a non-canonical alias.
                l_canonical = Ligand.objects.get(
                    properities__web_links__web_resource=web_resource,
                    properities__web_links__index=ligand_id,
                    canonical=True)
                try:
                    ligand, created = Ligand.objects.get_or_create(
                        properities=l_canonical.properities,
                        name=ligand_name,
                        canonical=False)
                except IntegrityError:
                    ligand = Ligand.objects.get(
                        properities=l_canonical.properities,
                        name=ligand_name,
                        canonical=False)
            except Ligand.DoesNotExist:
                # Fetch the ligand from PubChem.
                default_ligand_type = 'Small molecule'
                lt, created = LigandType.objects.get_or_create(
                    slug=slugify(default_ligand_type),
                    defaults={'name': default_ligand_type})
                ligand = Ligand()
                ligand = ligand.load_from_pubchem(pubchem_lookup_value,
                                                  ligand_id, lt, ligand_name)
                if ligand is None and type_id == 'SMILES':
                    # Insert manually if the SMILES is not found in PubChem.
                    try:
                        ligand = Ligand.objects.get(
                            name=ligand_name,
                            canonical=True,
                            properities__smiles=ligand_id)
                    except Ligand.DoesNotExist:
                        try:
                            ligand = Ligand.objects.get(
                                name__startswith=ligand_name,
                                canonical=True,
                                properities__smiles=ligand_id)
                        except Ligand.DoesNotExist:
                            try:
                                # Same name but no SMILES stored yet.
                                ligand = Ligand.objects.get(
                                    name=ligand_name,
                                    canonical=True,
                                    properities__smiles=None)
                                ligand.properities.smiles = ligand_id
                                ligand.properities.save()
                                ligand.save()
                            except Ligand.DoesNotExist:
                                # Insert a new ligand, but first make sure
                                # the name is unique.
                                if Ligand.objects.filter(
                                        name=ligand_name).exists():
                                    same_prefix = Ligand.objects.filter(
                                        name__startswith=ligand_name,
                                        canonical=True).order_by("pk")
                                    # Only the suffix of the newest match is
                                    # used.  It stays None when the name is
                                    # taken by non-canonical ligands only,
                                    # which previously raised NameError.
                                    last = None
                                    for l_temp in same_prefix:
                                        last = l_temp.name.split("_")[-1]
                                    if last is None or last == ligand_name:
                                        # No numeric suffix in use yet.
                                        ligand_name = ligand_name + "_1"
                                    else:
                                        ligand_name = ligand_name + "_" + str(
                                            int(last) + 1)
                                ligand = Ligand()
                                ligand.name = ligand_name
                                lp = LigandProperities()
                                lp.smiles = ligand_id
                                lp.ligand_type = lt
                                lp.save()
                                ligand.properities = lp
                                # Maybe false, but that would break stuff.
                                ligand.canonical = True
                                ligand.ambigious_alias = False
                                try:
                                    ligand.save()
                                except IntegrityError:
                                    ligand = Ligand.objects.get(
                                        name=ligand_name, canonical=True)
    elif name:
        if Ligand.objects.filter(name=name, canonical=True).exists():
            # The name is canonical and already has a ligand record.
            ligand = Ligand.objects.get(name=name, canonical=True)
        elif Ligand.objects.filter(name=name, canonical=False,
                                   ambigious_alias=False).exists():
            # Alias with a single canonical parent.
            ligand = Ligand.objects.get(name=name, canonical=False,
                                        ambigious_alias=False)
        elif Ligand.objects.filter(name=name, canonical=False,
                                   ambigious_alias=True).exists():
            # Alias with several canonical parents: start with an empty
            # record that has to be investigated.
            lp = LigandProperities()
            lp.save()
            ligand = Ligand()
            ligand.properities = lp
            ligand.name = name
            ligand.canonical = False
            ligand.ambigious_alias = True
            ligand.save()
            ligand.load_by_name(name)
        else:
            # Neither a canonical nor an alias exists: create the records.
            lp = LigandProperities()
            lp.save()
            ligand = Ligand()
            ligand.properities = lp
            ligand.name = str(name)
            ligand.canonical = True
            ligand.ambigious_alias = False
            try:
                ligand.save()
                ligand.load_by_name(str(name))
            except IntegrityError:
                ligand = Ligand.objects.get(name=str(name), canonical=True)
    else:
        ligand = None

    return ligand
def create_mutant_data(self, filenames):
    """Parse mutant data files and store one ``MutationExperiment`` per row.

    Files are read from ``self.structure_data_dir``.  Excel files go through
    ``self.loaddatafromexcel`` and ``self.analyse_rows``; YAML files are
    converted to the same row layout with empty experiment fields.  Files
    with any other extension are logged and ignored.  A row is skipped when
    its publication cannot be saved, its protein is unknown, or no residue
    exists at the given position.

    Args:
        filenames: names of the files to parse; when falsy, every entry of
            ``self.structure_data_dir`` is used.
    """
    self.logger.info('CREATING MUTANT DATA')

    # What files should be parsed?
    if not filenames:
        filenames = os.listdir(self.structure_data_dir)

    missing_proteins = {}
    mutants_for_proteins = {}

    # Experiment types whose values are -log10 concentrations.
    logtypes = ['pEC50', 'pIC50', 'pK']
    log_type_pattern = re.compile("(" + ")|(".join(logtypes) + ")")

    for source_file in filenames:
        source_file_path = os.sep.join([self.structure_data_dir, source_file])
        # Ignore directories and hidden files.
        if not os.path.isfile(source_file_path) or source_file[0] == '.':
            continue

        self.logger.info('Reading file {}'.format(source_file_path))
        if source_file[-4:] == 'xlsx' or source_file[-3:] == 'xls':
            rows = self.loaddatafromexcel(source_file_path)
            rows = self.analyse_rows(rows)
        elif source_file[-4:] == 'yaml':
            # NOTE(review): yaml.load() without an explicit Loader can build
            # arbitrary objects and is rejected by recent PyYAML releases;
            # consider yaml.safe_load() if these files are plain data.
            with open(source_file_path, 'r') as handle:
                yaml_rows = yaml.load(handle)
            rows = []
            for entry in yaml_rows:
                d = {}
                d['reference'] = entry['pubmed']
                d['protein'] = entry['entry_name'].replace("__", "_").lower()
                d['mutation_pos'] = entry['seq']
                d['mutation_from'] = entry['from_res']
                d['mutation_to'] = entry['to_res']
                d['ligand_name'] = ''
                d['ligand_type'] = ''
                d['ligand_id'] = ''
                d['ligand_class'] = ''
                d['exp_type'] = ''
                d['exp_func'] = ''
                d['exp_wt_value'] = 0
                d['exp_wt_unit'] = ''
                d['exp_mu_effect_sign'] = ''
                d['exp_mu_value_raw'] = 0
                d['fold_effect'] = 0
                d['exp_mu_effect_qual'] = ''
                d['exp_mu_effect_ligand_prop'] = ''
                d['exp_mu_ligand_ref'] = ''
                d['opt_type'] = ''
                d['opt_wt'] = 0
                d['opt_mu'] = 0
                d['opt_sign'] = ''
                d['opt_percentage'] = 0
                d['opt_qual'] = ''
                d['opt_agonist'] = ''
                # Something is off with the amino acid: drop the entry.
                if len(d['mutation_to']) > 1 or len(d['mutation_from']) > 1:
                    continue
                rows.append(d)
        else:
            # Was 'unknown format'.source_file, which raised AttributeError.
            self.logger.info('unknown format ' + source_file)
            continue

        c = 0
        skipped = 0
        for r in rows:
            c += 1
            if c % 1000 == 0:
                self.logger.info('Parsed ' + str(c) + ' mutant data entries')

            # Publication.  Numeric ids may arrive as floats; turn them into
            # integer strings.
            try:
                float(r['reference'])
                r['reference'] = str(int(r['reference']))
            except ValueError:
                pass

            # Digits only -> assume PubMed id, anything else -> assume DOI.
            pub_type = 'pubmed' if r['reference'].isdigit() else 'doi'

            try:
                pub = Publication.objects.get(
                    web_link__index=r['reference'],
                    web_link__web_resource__slug=pub_type)
            except Publication.DoesNotExist:
                pub = Publication()
                try:
                    pub.web_link = WebLink.objects.get(
                        index=r['reference'], web_resource__slug=pub_type)
                except WebLink.DoesNotExist:
                    wl = WebLink.objects.create(
                        index=r['reference'],
                        web_resource=WebResource.objects.get(slug=pub_type))
                    pub.web_link = wl

                if pub_type == 'doi':
                    pub.update_from_doi(doi=r['reference'])
                elif pub_type == 'pubmed':
                    pub.update_from_pubmed_data(index=r['reference'])
                try:
                    pub.save()
                except Exception:
                    # If something is off with the publication, skip the row.
                    self.logger.error('error with reference '
                                      + str(r['reference']) + ' ' + pub_type)
                    continue

            # Ligand.
            if r['ligand_type'] == 'PubChem CID' or r['ligand_type'] == 'SMILES':
                if r['ligand_type'] == 'PubChem CID':
                    pubchem_lookup_value = 'cid'
                elif r['ligand_type'] == 'SMILES':
                    pubchem_lookup_value = 'smiles'

                try:
                    web_resource = WebResource.objects.get(slug='pubchem')
                except Exception as err:
                    # Abort if the PubChem resource is not found.
                    raise Exception(
                        'PubChem resource not found, aborting!') from err

                if 'ligand_name' in r and r['ligand_name']:
                    ligand_name = str(r['ligand_name'])
                else:
                    ligand_name = False

                try:
                    # The name is canonical and already has a ligand record.
                    ligand = Ligand.objects.get(
                        name=ligand_name,
                        canonical=True,
                        properities__web_links__web_resource=web_resource,
                        properities__web_links__index=r['ligand_id'])
                except Ligand.DoesNotExist:
                    try:
                        # The compound exists under a different name.
                        l_canonical = Ligand.objects.get(
                            properities__web_links__web_resource=web_resource,
                            properities__web_links__index=r['ligand_id'],
                            canonical=True)
                        ligand, created = Ligand.objects.get_or_create(
                            properities=l_canonical.properities,
                            name=ligand_name,
                            canonical=False)
                        if created:
                            self.logger.info(
                                'Created ligand {}'.format(ligand.name))
                    except Ligand.DoesNotExist:
                        # Fetch the ligand from PubChem.
                        default_ligand_type = 'Small molecule'
                        lt, created = LigandType.objects.get_or_create(
                            slug=slugify(default_ligand_type),
                            defaults={'name': default_ligand_type})
                        ligand = Ligand()
                        ligand = ligand.load_from_pubchem(
                            pubchem_lookup_value, r['ligand_id'], lt,
                            ligand_name)
            elif r['ligand_name']:
                if Ligand.objects.filter(name=r['ligand_name'],
                                         canonical=True).exists():
                    # The name is canonical and already has a ligand record.
                    ligand = Ligand.objects.get(name=r['ligand_name'],
                                                canonical=True)
                elif Ligand.objects.filter(name=r['ligand_name'],
                                           canonical=False,
                                           ambigious_alias=False).exists():
                    # Alias with a single canonical parent.
                    ligand = Ligand.objects.get(name=r['ligand_name'],
                                                canonical=False,
                                                ambigious_alias=False)
                elif Ligand.objects.filter(name=r['ligand_name'],
                                           canonical=False,
                                           ambigious_alias=True).exists():
                    # Alias with several canonical parents: start with an
                    # empty record that has to be investigated.
                    lp = LigandProperities()
                    lp.save()
                    ligand = Ligand()
                    ligand.properities = lp
                    ligand.name = r['ligand_name']
                    ligand.canonical = False
                    ligand.ambigious_alias = True
                    ligand.save()
                    ligand.load_by_name(r['ligand_name'])
                else:
                    # Neither a canonical nor an alias exists: create them.
                    lp = LigandProperities()
                    lp.save()
                    ligand = Ligand()
                    ligand.properities = lp
                    ligand.name = str(r['ligand_name'])
                    ligand.canonical = True
                    ligand.ambigious_alias = False
                    ligand.save()
                    ligand.load_by_name(str(r['ligand_name']))
            else:
                ligand = None

            # Reference ligand.
            ref_name = r['exp_mu_ligand_ref']
            if Ligand.objects.filter(name=ref_name, canonical=True).exists():
                l_ref = Ligand.objects.get(name=ref_name, canonical=True)
            elif Ligand.objects.filter(name=ref_name, canonical=False,
                                       ambigious_alias=False).exists():
                l_ref = Ligand.objects.get(name=ref_name, canonical=False,
                                           ambigious_alias=False)
            elif Ligand.objects.filter(name=ref_name, canonical=False,
                                       ambigious_alias=True).exists():
                lp = LigandProperities()
                lp.save()
                l_ref = Ligand()
                l_ref.properities = lp
                l_ref.name = ref_name
                l_ref.canonical = False
                l_ref.ambigious_alias = True
                l_ref.save()
                l_ref.load_by_name(ref_name)
                l_ref.save()
            elif ref_name:
                lp = LigandProperities()
                lp.save()
                l_ref = Ligand()
                l_ref.properities = lp
                l_ref.name = ref_name
                l_ref.canonical = True
                l_ref.ambigious_alias = False
                l_ref.save()
                l_ref.load_by_name(ref_name)
                l_ref.save()
            else:
                l_ref = None

            # Protein: skip the row when the entry name is unknown.
            protein = Protein.objects.filter(entry_name=r['protein'])
            if protein.exists():
                protein = protein.get()
                mutants_for_proteins[r['protein']] = (
                    mutants_for_proteins.get(r['protein'], 0) + 1)
            else:
                skipped += 1
                missing_proteins[r['protein']] = (
                    missing_proteins.get(r['protein'], 0) + 1)
                self.logger.error('Skipped due to no protein ' + r['protein'])
                continue

            res = Residue.objects.filter(
                protein_conformation__protein=protein,
                sequence_number=r['mutation_pos'])
            if res.exists():
                res = res.get()
            else:
                self.logger.error('Skipped due to no residue ' + r['protein']
                                  + ' pos:' + str(r['mutation_pos']))
                skipped += 1
                continue

            if r['ligand_class']:
                l_role, created = LigandRole.objects.get_or_create(
                    name=r['ligand_class'],
                    defaults={'slug': slugify(r['ligand_class'])[:50]
                              })  # FIXME this should not be needed
            else:
                l_role = None

            if r['exp_type']:
                exp_type_id, created = MutationExperimentalType.objects.get_or_create(
                    type=r['exp_type'])
            else:
                exp_type_id = None

            if r['exp_func']:
                exp_func_id, created = MutationFunc.objects.get_or_create(
                    func=r['exp_func'])
            else:
                exp_func_id = None

            if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']:
                exp_qual_id, created = MutationQual.objects.get_or_create(
                    qual=r['exp_mu_effect_qual'],
                    prop=r['exp_mu_effect_ligand_prop'])
            else:
                exp_qual_id = None

            if (r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r['opt_sign']
                    or r['opt_percentage'] or r['opt_qual']
                    or r['opt_agonist']):
                exp_opt_id, created = MutationOptional.objects.get_or_create(
                    type=r['opt_type'],
                    wt=r['opt_wt'],
                    mu=r['opt_mu'],
                    sign=r['opt_sign'],
                    percentage=r['opt_percentage'],
                    qual=r['opt_qual'],
                    agonist=r['opt_agonist'])
            else:
                exp_opt_id = None

            mutation, created = Mutation.objects.get_or_create(
                amino_acid=r['mutation_to'], protein=protein, residue=res)

            # Fold change: ratios below 1 are stored as the negative inverse.
            foldchange = 0
            if r['exp_wt_value'] != 0 and r['exp_mu_value_raw'] != 0:
                if log_type_pattern.match(r['exp_type']):
                    # -log values
                    foldchange = round(
                        math.pow(10, -r['exp_mu_value_raw'])
                        / pow(10, -r['exp_wt_value']), 3)
                else:
                    foldchange = round(
                        r['exp_mu_value_raw'] / r['exp_wt_value'], 3)
                if foldchange < 1 and foldchange != 0:
                    foldchange = -round((1 / foldchange), 3)
            elif r['fold_effect'] != 0:
                foldchange = round(r['fold_effect'], 3)
                # Rounding can yield 0; guard it like the branch above so the
                # inversion cannot raise ZeroDivisionError.
                if foldchange < 1 and foldchange != 0:
                    foldchange = -round((1 / foldchange), 3)

            raw_experiment = self.insert_raw(r)
            MutationExperiment.objects.get_or_create(
                refs=pub,
                protein=protein,
                residue=res,
                ligand=ligand,
                ligand_role=l_role,
                ligand_ref=l_ref,
                raw=raw_experiment,
                optional=exp_opt_id,
                exp_type=exp_type_id,
                exp_func=exp_func_id,
                exp_qual=exp_qual_id,
                mutation=mutation,
                wt_value=r['exp_wt_value'],
                mu_value=r['exp_mu_value_raw'],
                mu_sign=r['exp_mu_effect_sign'],
                foldchange=foldchange)

        self.logger.info('Parsed ' + str(c) + ' mutant data entries. Skipped '
                         + str(skipped))

    self.logger.info('COMPLETED CREATING MUTANTS')
def main_func(self, positions, iteration, count, lock):
    """Worker loop importing rows of ``self.data_all`` as mutation experiments.

    Rows are claimed one at a time through the shared counter ``count``,
    which is read and advanced while holding ``lock``.  For every claimed row
    the publication, optional review, ligand, reference ligand, protein,
    residue and the experiment lookup records are resolved (and created when
    missing).  A row is skipped when its publication, review or ligand cannot
    be resolved, when its protein cannot be found, or when no residue matches
    the position and wild-type amino acid.  The collected ``MutationRaw`` and
    ``MutationExperiment`` objects are written with two ``bulk_create`` calls
    once no rows are left.

    Args:
        positions: not used by this method.
        iteration: not used by this method.
        count: shared counter exposing ``.value``, the index of the next
            unclaimed row.
        lock: context manager guarding ``count``.
    """

    def resolve_publication(index, pub_type, label):
        """Return the Publication for ``index`` or None if it cannot be saved.

        ``label`` ('reference' or 'review') is only used in the error log.
        """
        if index in self.publication_cache:
            return self.publication_cache[index]
        try:
            wl = WebLink.objects.get(index=index, web_resource__slug=pub_type)
        except WebLink.DoesNotExist:
            try:
                wl = WebLink.objects.create(
                    index=index,
                    web_resource=WebResource.objects.get(slug=pub_type))
            except IntegrityError:
                # Another worker created the link in the meantime.
                wl = WebLink.objects.get(index=index,
                                         web_resource__slug=pub_type)
        try:
            publication = Publication.objects.get(web_link=wl)
        except Publication.DoesNotExist:
            publication = Publication()
            try:
                publication.web_link = wl
                publication.save()
            except IntegrityError:
                publication = Publication.objects.get(web_link=wl)
            if pub_type == 'doi':
                publication.update_from_doi(doi=index)
            elif pub_type == 'pubmed':
                publication.update_from_pubmed_data(index=index)
            try:
                publication.save()
            except Exception:
                # If something is off with the publication, the row is skipped.
                self.logger.error('error with ' + label + ' ' + str(index)
                                  + ' ' + pub_type)
                return None
        self.publication_cache[index] = publication
        return publication

    missing_proteins = {}
    mutants_for_proteins = {}
    wrong_uniport_ids = {}
    c = 0
    skipped = 0
    bulk_m = []
    bulk_r = []
    current_sheet = time.time()
    rows = self.data_all

    # Experiment types whose values are -log10 concentrations.
    logtypes = ['pEC50', 'pIC50', 'pK']
    log_type_pattern = re.compile("(" + ")|(".join(logtypes) + ")")

    while True:
        # Test and advance the counter under the same lock; testing it
        # outside let two workers pass the check and one index past the end.
        with lock:
            if count.value >= len(rows):
                break
            r = rows[count.value]
            count.value += 1

        c += 1

        # Numeric ids may arrive as floats; turn them into integer strings.
        # Each field is handled on its own so that a non-numeric reference
        # (e.g. a DOI) does not prevent the review from being normalised.
        for field in ('reference', 'review'):
            try:
                float(r[field])
                r[field] = str(int(r[field]))
            except ValueError:
                pass

        # Digits only -> assume PubMed id, anything else -> assume DOI.
        pub_type = 'pubmed' if r['reference'].isdigit() else 'doi'
        pub = resolve_publication(r['reference'], pub_type, 'reference')
        if pub is None:
            continue

        if r['review']:
            if r['review'].isdigit():
                pub_type = 'pubmed'
            elif r['review'].startswith('http'):
                pub_type = 'raw_link'
            else:
                pub_type = 'doi'
            pub_review = resolve_publication(r['review'], pub_type, 'review')
            if pub_review is None:
                continue
        else:
            pub_review = None

        # Ligand, cached per (name, id).
        ligand = None
        if str(r['ligand_name']) in self.ligand_cache:
            if r['ligand_id'] in self.ligand_cache[str(r['ligand_name'])]:
                ligand = self.ligand_cache[str(
                    r['ligand_name'])][r['ligand_id']]
        else:
            self.ligand_cache[str(r['ligand_name'])] = {}
        if not ligand:
            try:
                ligand = get_or_make_ligand(r['ligand_id'], r['ligand_type'],
                                            str(r['ligand_name']))
            except Exception as msg:
                print(
                    'Something errored with ligand, aborting entry of mutation',
                    r['ligand_name'], r['ligand_type'], r['ligand_id'],
                    r['source_file'])
                print(msg)
                traceback.print_exc()
                continue
            self.ligand_cache[str(r['ligand_name'])][r['ligand_id']] = ligand

        # Reference ligand, cached per name.
        ref_name = r['exp_mu_ligand_ref']
        if str(ref_name) in self.ref_ligand_cache:
            l_ref = self.ref_ligand_cache[str(ref_name)]
        else:
            if Ligand.objects.filter(name=ref_name, canonical=True).exists():
                # The name is canonical and already has a ligand record.
                l_ref = Ligand.objects.get(name=ref_name, canonical=True)
            elif Ligand.objects.filter(name=ref_name, canonical=False,
                                       ambigious_alias=False).exists():
                # Alias with a single canonical parent.
                l_ref = Ligand.objects.get(name=ref_name, canonical=False,
                                           ambigious_alias=False)
            elif Ligand.objects.filter(name=ref_name, canonical=False,
                                       ambigious_alias=True).exists():
                # Alias with several canonical parents: start with an empty
                # record that has to be investigated.
                lp = LigandProperities()
                lp.save()
                l_ref = Ligand()
                l_ref.properities = lp
                l_ref.name = ref_name
                l_ref.canonical = False
                l_ref.ambigious_alias = True
                l_ref.save()
                l_ref.load_by_name(ref_name)
                l_ref.save()
            elif Ligand.objects.filter(name=ref_name,
                                       canonical=False).exists():
                # Alias whose ambigious_alias flag is not specified.
                l_ref = Ligand.objects.get(name=ref_name, canonical=False)
                l_ref.ambigious_alias = False
                l_ref.save()
            elif ref_name:
                # Neither a canonical nor an alias exists: create the records.
                lp = LigandProperities()
                lp.save()
                l_ref = Ligand()
                l_ref.properities = lp
                l_ref.name = ref_name
                l_ref.canonical = True
                l_ref.ambigious_alias = False
                try:
                    l_ref.save()
                    l_ref.load_by_name(ref_name)
                except IntegrityError:
                    if Ligand.objects.filter(name=ref_name,
                                             canonical=True).exists():
                        l_ref = Ligand.objects.get(name=ref_name,
                                                   canonical=True)
                    else:
                        l_ref = Ligand.objects.get(name=ref_name,
                                                   canonical=False)
                try:
                    l_ref.save()
                except IntegrityError:
                    l_ref = Ligand.objects.get(name=ref_name, canonical=False)
            else:
                l_ref = None
            self.ref_ligand_cache[str(ref_name)] = l_ref

        # Protein: entry name, then corrections seen earlier, then UniProt
        # accession, then a UniProt web lookup.
        protein = Protein.objects.filter(entry_name=r['protein'])
        if protein.exists():
            protein = protein.get()
            mutants_for_proteins[r['protein']] = (
                mutants_for_proteins.get(r['protein'], 0) + 1)
        elif r['protein'] not in missing_proteins:
            try:
                # Read the mapping before overwriting r['protein'];
                # overwriting first made the second read raise KeyError, so
                # the cached correction was never used.
                real_uniprot = wrong_uniport_ids[r['protein']]
                protein = Protein.objects.get(entry_name=real_uniprot)
                r['protein'] = real_uniprot
            except Exception:
                # Look for it as a UniProt accession.
                protein = Protein.objects.filter(
                    web_links__web_resource__slug='uniprot',
                    web_links__index=r['protein'].upper())
                if protein.exists():
                    protein = protein.get()
                    real_uniprot = protein.entry_name
                    mutants_for_proteins[r['protein']] = (
                        mutants_for_proteins.get(r['protein'], 0) + 1)
                else:
                    # Try to look it up in UniProt to catch typing errors /
                    # variants in entry_name.
                    url = 'http://www.uniprot.org/uniprot/$index.xml'
                    cache_dir = ['uniprot', 'id']
                    uniprot_protein = fetch_from_web_api(url, r['protein'],
                                                         cache_dir, xml=True)
                    try:
                        real_uniprot = uniprot_protein.find(
                            './/{http://uniprot.org/uniprot}name'
                        ).text.lower()
                        protein = Protein.objects.get(entry_name=real_uniprot)
                    except Exception:
                        skipped += 1
                        missing_proteins[r['protein']] = (
                            missing_proteins.get(r['protein'], 0) + 1)
                        self.logger.error('Skipped due to no protein '
                                          + r['protein'])
                        continue
                # Remember the correction for later rows.
                wrong_uniport_ids[r['protein']] = protein.entry_name
                r['protein'] = real_uniprot
        else:
            # Already known to be missing: count it and move on.
            missing_proteins[r['protein']] += 1
            continue

        # Residue must match both the position and the wild-type amino acid.
        res = Residue.objects.filter(
            protein_conformation__protein=protein,
            amino_acid=r['mutation_from'],
            sequence_number=r['mutation_pos'])  # FIXME MAKE AA CHECK
        if res.exists():
            res = res.get()
        else:
            self.logger.error('Skipped due to no residue or mismatch AA '
                              + r['protein'] + ' pos:'
                              + str(r['mutation_pos']) + ' AA:'
                              + r['mutation_from'])
            skipped += 1
            continue

        if r['ligand_class']:
            role_slug = slugify(r['ligand_class'])[:50]
            try:
                l_role, created = LigandRole.objects.get_or_create(
                    name=r['ligand_class'],
                    defaults={'slug': role_slug
                              })  # FIXME this should not be needed
            except Exception as e:
                if LigandRole.objects.filter(slug=role_slug).exists():
                    l_role = LigandRole.objects.get(slug=role_slug)
                    if l_role.name == role_slug:
                        # If the name of the role is the same as the slug it
                        # was created by the constructs script; replace it.
                        l_role.name = r['ligand_class']
                        l_role.save()
                else:
                    print(e)
                    print("Error with", r['ligand_class'], role_slug)
                    l_role, created = LigandRole.objects.get_or_create(
                        slug=role_slug)  # FIXME this should not be needed
        else:
            l_role = None

        if r['exp_type']:
            exp_type_id, created = MutationExperimentalType.objects.get_or_create(
                type=r['exp_type'])
        else:
            exp_type_id = None

        if r['exp_func']:
            exp_func_id, created = MutationFunc.objects.get_or_create(
                func=r['exp_func'])
        else:
            exp_func_id = None

        if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']:
            exp_qual_id, created = MutationQual.objects.get_or_create(
                qual=r['exp_mu_effect_qual'],
                prop=r['exp_mu_effect_ligand_prop'])
        else:
            exp_qual_id = None

        try:
            mutation, created = Mutation.objects.get_or_create(
                amino_acid=r['mutation_to'], protein=protein, residue=res)
        except IntegrityError:
            mutation = Mutation.objects.get(amino_acid=r['mutation_to'],
                                            protein=protein,
                                            residue=res)

        # Fold change: ratios between 0 and 1 are stored as the negative
        # inverse.
        foldchange = 0
        if r['exp_wt_value'] != 0 and r['exp_mu_value_raw'] != 0:
            if log_type_pattern.match(r['exp_type']):
                # -log values
                try:
                    foldchange = round(
                        math.pow(10, -r['exp_mu_value_raw'])
                        / pow(10, -r['exp_wt_value']), 3)
                except Exception:
                    # Best effort: show the offending row, keep foldchange 0.
                    print(r)
            elif "%" == r['exp_wt_unit']:
                # If % then it's a difference case, where a lower value is
                # bad.  Otherwise it's a concentration and lower is better.
                foldchange = round(
                    r['exp_wt_value'] / r['exp_mu_value_raw'], 3)
            else:
                foldchange = round(
                    r['exp_mu_value_raw'] / r['exp_wt_value'], 3)
            if 0 < foldchange < 1:
                foldchange = -round((1 / foldchange), 3)
        elif r['fold_effect'] != 0:
            foldchange = round(r['fold_effect'], 3)
            # Rounding can yield 0; guard it so the inversion cannot raise
            # ZeroDivisionError.
            if foldchange < 1 and foldchange != 0:
                foldchange = -round((1 / foldchange), 3)
        r['fold_effect'] = foldchange

        raw_experiment = self.insert_raw(r)
        bulk = MutationExperiment(
            refs=pub,
            review=pub_review,
            submitting_group=r['submitting_group'],
            data_container=r['data_container'],
            data_container_number=r['data_container_number'],
            protein=protein,
            residue=res,
            ligand=ligand,
            ligand_role=l_role,
            ligand_ref=l_ref,
            exp_type=exp_type_id,
            exp_func=exp_func_id,
            exp_qual=exp_qual_id,
            mutation=mutation,
            wt_value=r['exp_wt_value'],
            mu_value=r['exp_mu_value_raw'],
            mu_sign=r['exp_mu_effect_sign'],
            foldchange=foldchange,
            opt_receptor_expression=r['opt_receptor_expression'],
            opt_basal_activity=r['opt_basal_activity'],
            opt_gain_of_activity=r['opt_gain_of_activity'],
            opt_ligand_emax=r['opt_ligand_emax'],
            opt_agonist=r['opt_agonist'],
        )
        bulk_r.append(raw_experiment)
        bulk_m.append(bulk)

    self.logger.info('Parsed ' + str(c) + ' mutant data entries. Skipped '
                     + str(skipped))

    # Write the raw rows first so each experiment can point at its raw row.
    current = time.time()
    raws = MutationRaw.objects.bulk_create(bulk_r)
    for experiment, raw in zip(bulk_m, raws):
        experiment.raw = raw
    MutationExperiment.objects.bulk_create(bulk_m)
    end = time.time()
    diff = round(end - current, 2)
    diff_2 = round(end - current_sheet, 2)
    print("overall", diff_2, "bulk", diff, len(bulk_m), "skipped",
          str(skipped))
def get_or_make_ligand(ligand_id, type_id, name=None):
    """Return the Ligand for an external identifier or a name, creating it if needed.

    Args:
        ligand_id: identifier value, interpreted according to ``type_id``
            (a PubChem CID or a SMILES string). Not used for name-only lookups.
        type_id: ``'PubChem CID'`` or ``'SMILES'`` to resolve the ligand through
            the ``pubchem`` WebResource; any other value falls back to a lookup
            by ``name``.
        name: optional ligand name.

    Returns:
        A Ligand, or ``None`` when ``type_id`` is not a supported identifier
        type and no name is given. The value returned by
        ``Ligand.load_from_pubchem`` is passed through, so ``None`` is also
        possible for an unresolved PubChem CID.

    Raises:
        Exception: if no WebResource with slug ``pubchem`` exists.
    """
    if type_id in ('PubChem CID', 'SMILES'):
        pubchem_lookup_value = 'cid' if type_id == 'PubChem CID' else 'smiles'

        try:
            web_resource = WebResource.objects.get(slug='pubchem')
        except WebResource.DoesNotExist as err:
            # abort if the pubchem resource is not found
            raise Exception('PubChem resource not found, aborting!') from err

        # False (rather than None) marks "no name given"; it is handed on
        # unchanged to the lookups below, as before.
        ligand_name = name if name else False

        try:
            if ligand_name is False:
                # No name to match on: take a canonical ligand linked to this id.
                l = Ligand.objects.filter(
                    canonical=True,
                    properities__web_links__web_resource=web_resource,
                    properities__web_links__index=ligand_id).first()
                if l is None:
                    raise Ligand.DoesNotExist()
            else:
                # if this name is canonical and it has a ligand record already
                l = Ligand.objects.get(
                    name=ligand_name, canonical=True,
                    properities__web_links__web_resource=web_resource,
                    properities__web_links__index=ligand_id)
        except Ligand.DoesNotExist:
            try:
                # if exists under different name, register this name as an alias
                l_canonical = Ligand.objects.get(
                    properities__web_links__web_resource=web_resource,
                    properities__web_links__index=ligand_id, canonical=True)
                try:
                    l, created = Ligand.objects.get_or_create(
                        properities=l_canonical.properities,
                        name=ligand_name, canonical=False)
                except IntegrityError:
                    # get_or_create failed on a constraint: fetch the existing alias.
                    l = Ligand.objects.get(
                        properities=l_canonical.properities,
                        name=ligand_name, canonical=False)
            except Ligand.DoesNotExist:
                # fetch ligand from pubchem
                default_ligand_type = 'Small molecule'
                lt, created = LigandType.objects.get_or_create(
                    slug=slugify(default_ligand_type),
                    defaults={'name': default_ligand_type})
                l = Ligand().load_from_pubchem(
                    pubchem_lookup_value, ligand_id, lt, ligand_name)
                if l is None and type_id == 'SMILES':
                    # insert manually if smiles and unfound in pubchem
                    l = _get_or_insert_smiles_ligand(ligand_id, ligand_name, lt)

    elif name:
        if Ligand.objects.filter(name=name, canonical=True).exists():
            # this name is canonical and it has a ligand record already
            l = Ligand.objects.get(name=name, canonical=True)
        elif Ligand.objects.filter(name=name, canonical=False,
                                   ambigious_alias=False).exists():
            # alias that has only one parent canonical name - eg distinct
            l = Ligand.objects.get(name=name, canonical=False,
                                   ambigious_alias=False)
        elif Ligand.objects.filter(name=name, canonical=False,
                                   ambigious_alias=True).exists():
            # alias with several canonical parents: must investigate, start
            # with an empty properties record.
            lp = LigandProperities()
            lp.save()
            l = Ligand()
            l.properities = lp
            l.name = name
            l.canonical = False
            l.ambigious_alias = True
            l.save()
            l.load_by_name(name)
        else:
            # neither a canonical nor an alias exists: create the records.
            lp = LigandProperities()
            lp.save()
            l = Ligand()
            l.properities = lp
            l.name = str(name)
            l.canonical = True
            l.ambigious_alias = False
            try:
                l.save()
                l.load_by_name(str(name))
            except IntegrityError:
                l = Ligand.objects.get(name=str(name), canonical=True)
    else:
        l = None

    return l


def _next_free_ligand_name(base_name):
    """Return ``base_name`` plus the lowest ``_<n>`` suffix no Ligand uses yet."""
    suffix = 1
    candidate = '{}_{}'.format(base_name, suffix)
    while Ligand.objects.filter(name=candidate).exists():
        suffix += 1
        candidate = '{}_{}'.format(base_name, suffix)
    return candidate


def _get_or_insert_smiles_ligand(ligand_id, ligand_name, ligand_type):
    """Find or create the canonical Ligand for a SMILES string PubChem did not resolve."""
    try:
        return Ligand.objects.get(name=ligand_name, canonical=True,
                                  properities__smiles=ligand_id)
    except Ligand.DoesNotExist:
        pass

    try:
        # same SMILES stored under a suffixed variant of the name
        return Ligand.objects.get(name__startswith=ligand_name, canonical=True,
                                  properities__smiles=ligand_id)
    except Ligand.DoesNotExist:
        pass

    try:
        # same name but no SMILES stored yet: attach it to the existing record
        l = Ligand.objects.get(name=ligand_name, canonical=True,
                               properities__smiles=None)
    except Ligand.DoesNotExist:
        pass
    else:
        l.properities.smiles = ligand_id
        l.properities.save()
        l.save()
        return l

    # Now insert a new ligand, but first make sure the name is unique. The
    # previous suffix arithmetic failed when no canonical ligand matched the
    # prefix or when an existing name ended in a non-numeric "_" part.
    if Ligand.objects.filter(name=ligand_name).exists():
        ligand_name = _next_free_ligand_name(ligand_name)

    lp = LigandProperities()
    lp.smiles = ligand_id
    lp.ligand_type = ligand_type
    lp.save()

    l = Ligand()
    l.name = ligand_name
    l.properities = lp
    l.canonical = True  # maybe false, but that would break stuff.
    l.ambigious_alias = False
    try:
        l.save()
    except IntegrityError:
        l = Ligand.objects.get(name=ligand_name, canonical=True)
    return l
def build_ligand_properties(self):
    """Save and return a placeholder LigandProperities of type 'small molecule'.

    Every descriptor field is explicitly reset to None before the record is
    saved, and an info message is logged noting that an empty record is
    returned.

    Returns:
        The saved LigandProperities instance.
    """
    placeholder = LigandProperities()
    placeholder.ligand_type = LigandType.objects.get(name = 'small molecule')

    # Blank out all descriptors so the record carries only its ligand type.
    blank_fields = ('smiles', 'inchikey', 'sequence', 'mw',
                    'rotatable_bonds', 'hacc', 'hdon', 'logp')
    for field_name in blank_fields:
        setattr(placeholder, field_name, None)

    placeholder.save()
    self.logger.info("Could not create ligand, empty is returned")
    return placeholder
def main_func(self, positions, iteration, count, lock):
    """Worker loop that imports cached ligand records from ``self.ligand_dump``.

    Each pass claims the next unprocessed record through the shared counter,
    makes sure a LigandProperities/Ligand pair exists for it, and then adds
    any web links and vendor links listed in the record.

    Args:
        positions: unused here.
        iteration: unused here.
        count: shared counter exposing an integer ``value`` attribute; it is
            the index of the next record to claim.
        lock: context manager guarding ``count``.
    """
    ligands = self.ligand_dump
    total = len(ligands)

    while True:
        with lock:
            # Test and claim under the same lock. Checking the bound outside
            # the lock let another worker take the last record in between,
            # which made the indexing below raise IndexError.
            index = count.value
            if index >= total:
                break
            count.value = index + 1
            l = ligands[index]
            if count.value % 10000 == 0:
                print('{} Status {} out of {}'.format(
                    datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    count.value, total))

        if 'logp' not in l:
            # temp skip to only use "full" annotated ligands
            continue

        lp = LigandProperities.objects.filter(inchikey=l['inchikey']).first()
        ligand = None
        if lp:
            # The inchikey is known: check whether this name is attached to it.
            ligand = Ligand.objects.filter(
                name=l['name'], properities=lp).prefetch_related(
                    'properities__ligand_type', 'properities__web_links',
                    'properities__vendors').first()

        # If the name with corresponding inchikey is there, assume all is good;
        # links are still added below to make sure they're there.
        if not ligand:
            if lp:
                print(l['name'], 'is there! (but not by name, only inchi')
            else:
                # No ligand seems to match by inchikey -- start creating it.
                # Make LigandProperities first
                lt, created = LigandType.objects.get_or_create(
                    slug=l['ligand_type__slug'],
                    defaults={'name': l['ligand_type__name']})
                lp = LigandProperities()
                lp.inchikey = l['inchikey']
                lp.smiles = l['smiles']
                lp.mw = l['mw']
                lp.logp = l['logp']
                lp.rotatable_bonds = l['rotatable_bonds']
                lp.hacc = l['hacc']
                lp.hdon = l['hdon']
                lp.ligand_type = lt
                lp.save()

            ligand = Ligand()
            ligand.properities = lp
            ligand.name = l['name']
            ligand.canonical = l['canonical']
            ligand.ambigious_alias = l['ambigious_alias']
            ligand.save()

        # create links - impossible to make duplicates so no need to check if there already
        if ligand.properities.web_links.count() < len(l['web_links']):
            for link in l['web_links']:
                wr = WebResource.objects.get(slug=link['web_resource'])
                wl, created = WebLink.objects.get_or_create(
                    index=link['index'], web_resource=wr)
                ligand.properities.web_links.add(wl)

        # create vendors - skipped per link when a link with that sid already exists
        if ligand.properities.vendors.count() < len(l['vendors']):
            for link in l['vendors']:
                lv = LigandVendors.objects.get(slug=link['vendor_slug'])
                if not LigandVendorLink.objects.filter(sid=link['sid']).exists():
                    lvl = LigandVendorLink()
                    lvl.sid = link['sid']
                    lvl.vendor = lv
                    lvl.lp = ligand.properities
                    lvl.vendor_external_id = link['vendor_external_id']
                    lvl.url = link['url']
                    lvl.save()
def create_mutant_data(self, filenames):
    """Parse mutant data files and create a MutationExperiment for each usable row.

    Files are read from ``self.structure_data_dir``. Names ending in
    ``xlsx``/``xls`` go through ``self.loaddatafromexcel`` and
    ``self.analyse_rows``; names ending in ``yaml`` are converted to the same
    row layout with empty experiment fields. Anything else is logged and
    skipped, as are hidden files and entries that are not regular files.

    A row is skipped when its publication cannot be saved, or when its
    protein or residue is not found.

    Args:
        filenames: file names relative to ``self.structure_data_dir``; when
            empty, every entry in that directory is considered.
    """
    self.logger.info('CREATING MUTANT DATA')

    # what files should be parsed?
    if not filenames:
        filenames = os.listdir(self.structure_data_dir)

    # Experiment types whose values are -log10 based (matched as a prefix).
    log_type_pattern = re.compile("(" + ")|(".join(['pEC50', 'pIC50', 'pK']) + ")")

    for source_file in filenames:
        source_file_path = os.sep.join([self.structure_data_dir, source_file])
        if not os.path.isfile(source_file_path) or source_file[0] == '.':
            continue

        self.logger.info('Reading file {}'.format(source_file_path))

        if source_file.endswith(('xlsx', 'xls')):
            rows = self.loaddatafromexcel(source_file_path)
            rows = self.analyse_rows(rows)
        elif source_file.endswith('yaml'):
            # NOTE(review): this used to be yaml.load() on an unclosed file
            # with no Loader. safe_load only builds plain Python data, which
            # is all the keys read below need -- confirm no data file relies
            # on custom YAML tags.
            with open(source_file_path, 'r') as yaml_file:
                yaml_rows = yaml.safe_load(yaml_file) or []
            rows = []
            for r in yaml_rows:
                d = {
                    'reference': r['pubmed'],
                    'protein': r['entry_name'].replace("__", "_").lower(),
                    'mutation_pos': r['seq'],
                    'mutation_from': r['from_res'],
                    'mutation_to': r['to_res'],
                    'ligand_name': '',
                    'ligand_type': '',
                    'ligand_id': '',
                    'ligand_class': '',
                    'exp_type': '',
                    'exp_func': '',
                    'exp_wt_value': 0,
                    'exp_wt_unit': '',
                    'exp_mu_effect_sign': '',
                    'exp_mu_value_raw': 0,
                    'fold_effect': 0,
                    'exp_mu_effect_qual': '',
                    'exp_mu_effect_ligand_prop': '',
                    'exp_mu_ligand_ref': '',
                    'opt_type': '',
                    'opt_wt': 0,
                    'opt_mu': 0,
                    'opt_sign': '',
                    'opt_percentage': 0,
                    'opt_qual': '',
                    'opt_agonist': '',
                }
                # if something is off with amino acid
                if len(d['mutation_to']) > 1 or len(d['mutation_from']) > 1:
                    continue
                rows.append(d)
        else:
            # Was 'unknown format'.source_file, which raised AttributeError.
            self.logger.info('unknown format {}'.format(source_file))
            continue

        c = 0
        skipped = 0
        for r in rows:
            c += 1
            if c % 1000 == 0:
                self.logger.info('Parsed ' + str(c) + ' mutant data entries')

            # publication
            try:  # fix if it thinks it's float.
                float(r['reference'])
                r['reference'] = str(int(r['reference']))
            except ValueError:
                pass

            if r['reference'].isdigit():  # assume pubmed
                pub_type = 'pubmed'
            else:  # assume doi
                pub_type = 'doi'

            try:
                pub = Publication.objects.get(
                    web_link__index=r['reference'],
                    web_link__web_resource__slug=pub_type)
            except Publication.DoesNotExist:
                pub = Publication()
                try:
                    pub.web_link = WebLink.objects.get(
                        index=r['reference'], web_resource__slug=pub_type)
                except WebLink.DoesNotExist:
                    pub.web_link = WebLink.objects.create(
                        index=r['reference'],
                        web_resource=WebResource.objects.get(slug=pub_type))

                if pub_type == 'doi':
                    pub.update_from_doi(doi=r['reference'])
                elif pub_type == 'pubmed':
                    pub.update_from_pubmed_data(index=r['reference'])
                try:
                    pub.save()
                except Exception:
                    self.logger.error('error with reference ' +
                                      str(r['reference']) + ' ' + pub_type)
                    continue  # if something off with publication, skip.

            # ligand
            if r['ligand_type'] in ('PubChem CID', 'SMILES'):
                if r['ligand_type'] == 'PubChem CID':
                    pubchem_lookup_value = 'cid'
                else:
                    pubchem_lookup_value = 'smiles'

                try:
                    web_resource = WebResource.objects.get(slug='pubchem')
                except WebResource.DoesNotExist as err:
                    # abort if the pubchem resource is not found
                    raise Exception('PubChem resource not found, aborting!') from err

                if 'ligand_name' in r and r['ligand_name']:
                    ligand_name = str(r['ligand_name'])
                else:
                    ligand_name = False

                try:
                    # if this name is canonical and it has a ligand record already
                    l = Ligand.objects.get(
                        name=ligand_name, canonical=True,
                        properities__web_links__web_resource=web_resource,
                        properities__web_links__index=r['ligand_id'])
                except Ligand.DoesNotExist:
                    try:
                        # if exists under different name
                        l_canonical = Ligand.objects.get(
                            properities__web_links__web_resource=web_resource,
                            properities__web_links__index=r['ligand_id'],
                            canonical=True)
                        l, created = Ligand.objects.get_or_create(
                            properities=l_canonical.properities,
                            name=ligand_name, canonical=False)
                        if created:
                            self.logger.info('Created ligand {}'.format(l.name))
                    except Ligand.DoesNotExist:
                        # fetch ligand from pubchem
                        default_ligand_type = 'Small molecule'
                        lt, created = LigandType.objects.get_or_create(
                            slug=slugify(default_ligand_type),
                            defaults={'name': default_ligand_type})
                        l = Ligand()
                        l = l.load_from_pubchem(
                            pubchem_lookup_value, r['ligand_id'], lt, ligand_name)

            elif r['ligand_name']:
                if Ligand.objects.filter(name=r['ligand_name'],
                                         canonical=True).exists():
                    # this name is canonical and it has a ligand record already
                    l = Ligand.objects.get(name=r['ligand_name'], canonical=True)
                elif Ligand.objects.filter(name=r['ligand_name'], canonical=False,
                                           ambigious_alias=False).exists():
                    # alias that has only one parent canonical name - eg distinct
                    l = Ligand.objects.get(name=r['ligand_name'], canonical=False,
                                           ambigious_alias=False)
                elif Ligand.objects.filter(name=r['ligand_name'], canonical=False,
                                           ambigious_alias=True).exists():
                    # alias with several canonical parents, must investigate,
                    # start with empty.
                    lp = LigandProperities()
                    lp.save()
                    l = Ligand()
                    l.properities = lp
                    l.name = r['ligand_name']
                    l.canonical = False
                    l.ambigious_alias = True
                    l.save()
                    l.load_by_name(r['ligand_name'])
                else:
                    # neither a canonical nor an alias exists, create the records.
                    lp = LigandProperities()
                    lp.save()
                    l = Ligand()
                    l.properities = lp
                    l.name = str(r['ligand_name'])
                    l.canonical = True
                    l.ambigious_alias = False
                    l.save()
                    l.load_by_name(str(r['ligand_name']))
            else:
                l = None

            # reference ligand
            if Ligand.objects.filter(name=r['exp_mu_ligand_ref'],
                                     canonical=True).exists():
                # this name is canonical and it has a ligand record already
                l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                           canonical=True)
            elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False,
                                       ambigious_alias=False).exists():
                # alias that has only one parent canonical name - eg distinct
                l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                           canonical=False, ambigious_alias=False)
            elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False,
                                       ambigious_alias=True).exists():
                # alias with several canonical parents, must investigate,
                # start with empty.
                lp = LigandProperities()
                lp.save()
                l_ref = Ligand()
                l_ref.properities = lp
                l_ref.name = r['exp_mu_ligand_ref']
                l_ref.canonical = False
                l_ref.ambigious_alias = True
                l_ref.save()
                l_ref.load_by_name(r['exp_mu_ligand_ref'])
                l_ref.save()
            elif r['exp_mu_ligand_ref']:
                # neither a canonical nor an alias exists, create the records.
                lp = LigandProperities()
                lp.save()
                l_ref = Ligand()
                l_ref.properities = lp
                l_ref.name = r['exp_mu_ligand_ref']
                l_ref.canonical = True
                l_ref.ambigious_alias = False
                l_ref.save()
                l_ref.load_by_name(r['exp_mu_ligand_ref'])
                l_ref.save()
            else:
                l_ref = None

            protein = Protein.objects.filter(entry_name=r['protein'])
            if protein.exists():
                protein = protein.get()
            else:
                skipped += 1
                self.logger.error('Skipped due to no protein ' + r['protein'])
                continue

            res = Residue.objects.filter(
                protein_conformation__protein=protein,
                sequence_number=r['mutation_pos'])
            if res.exists():
                res = res.get()
            else:
                self.logger.error('Skipped due to no residue ' + r['protein'] +
                                  ' pos:' + str(r['mutation_pos']))
                skipped += 1
                continue

            if r['ligand_class']:
                l_role, created = LigandRole.objects.get_or_create(
                    name=r['ligand_class'],
                    defaults={'slug': slugify(r['ligand_class'])[:50]})  # FIXME this should not be needed
            else:
                l_role = None

            if r['exp_type']:
                exp_type_id, created = MutationExperimentalType.objects.get_or_create(
                    type=r['exp_type'])
            else:
                exp_type_id = None

            if r['exp_func']:
                exp_func_id, created = MutationFunc.objects.get_or_create(
                    func=r['exp_func'])
            else:
                exp_func_id = None

            if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']:
                exp_qual_id, created = MutationQual.objects.get_or_create(
                    qual=r['exp_mu_effect_qual'],
                    prop=r['exp_mu_effect_ligand_prop'])
            else:
                exp_qual_id = None

            if (r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r['opt_sign']
                    or r['opt_percentage'] or r['opt_qual'] or r['opt_agonist']):
                exp_opt_id, created = MutationOptional.objects.get_or_create(
                    type=r['opt_type'], wt=r['opt_wt'], mu=r['opt_mu'],
                    sign=r['opt_sign'], percentage=r['opt_percentage'],
                    qual=r['opt_qual'], agonist=r['opt_agonist'])
            else:
                exp_opt_id = None

            mutation, created = Mutation.objects.get_or_create(
                amino_acid=r['mutation_to'], protein=protein, residue=res)

            foldchange = 0
            if r['exp_wt_value'] != 0 and r['exp_mu_value_raw'] != 0:  # fix for new format
                if log_type_pattern.match(r['exp_type']):  # -log values!
                    foldchange = round(
                        math.pow(10, -r['exp_mu_value_raw']) /
                        pow(10, -r['exp_wt_value']), 3)
                else:
                    foldchange = round(
                        r['exp_mu_value_raw'] / r['exp_wt_value'], 3)
                # Ratios below 1 are stored as the negated inverse.
                if foldchange < 1 and foldchange != 0:
                    foldchange = -round((1 / foldchange), 3)
            elif r['fold_effect'] != 0:
                foldchange = round(r['fold_effect'], 3)
                # Same zero guard as above: a tiny fold_effect rounds to 0 and
                # used to raise ZeroDivisionError here.
                if foldchange < 1 and foldchange != 0:
                    foldchange = -round((1 / foldchange), 3)

            raw_experiment = self.insert_raw(r)
            MutationExperiment.objects.get_or_create(
                refs=pub,
                protein=protein,
                residue=res,
                ligand=l,
                ligand_role=l_role,
                ligand_ref=l_ref,
                raw=raw_experiment,
                optional=exp_opt_id,
                exp_type=exp_type_id,
                exp_func=exp_func_id,
                exp_qual=exp_qual_id,
                mutation=mutation,
                wt_value=r['exp_wt_value'],
                mu_value=r['exp_mu_value_raw'],
                mu_sign=r['exp_mu_effect_sign'],
                foldchange=foldchange)

        self.logger.info('Parsed ' + str(c) + ' mutant data entries. Skipped ' +
                         str(skipped))

    self.logger.info('COMPLETED CREATING MUTANTS')
def main_func(self, positions, iteration, count, lock):
    """Worker loop that imports cached ligand records from ``self.ligand_dump``.

    Each pass claims the next unprocessed record through the shared counter,
    makes sure a LigandProperities/Ligand pair exists for it, and then adds
    any web links and vendor links listed in the record.

    Args:
        positions: unused here.
        iteration: unused here.
        count: shared counter exposing an integer ``value`` attribute; it is
            the index of the next record to claim.
        lock: context manager guarding ``count``.
    """
    ligands = self.ligand_dump
    total = len(ligands)

    while True:
        with lock:
            # Test and claim under the same lock. Checking the bound outside
            # the lock let another worker take the last record in between,
            # which made the indexing below raise IndexError.
            index = count.value
            if index >= total:
                break
            count.value = index + 1
            l = ligands[index]
            if count.value % 10000 == 0:
                print('{} Status {} out of {}'.format(
                    datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    count.value, total))

        if 'logp' not in l:
            # temp skip to only use "full" annotated ligands
            continue

        lp = LigandProperities.objects.filter(inchikey=l['inchikey']).first()
        ligand = None
        if lp:
            # The inchikey is known: check whether this name is attached to it.
            ligand = Ligand.objects.filter(
                name=l['name'], properities=lp).prefetch_related(
                    'properities__ligand_type', 'properities__web_links',
                    'properities__vendors').first()

        # If the name with corresponding inchikey is there, assume all is good;
        # links are still added below to make sure they're there.
        if not ligand:
            if lp:
                print(l['name'],'is there! (but not by name, only inchi')
            else:
                # No ligand seems to match by inchikey -- start creating it.
                # Make LigandProperities first
                lt, created = LigandType.objects.get_or_create(
                    slug=l['ligand_type__slug'],
                    defaults={'name': l['ligand_type__name']})
                lp = LigandProperities()
                lp.inchikey = l['inchikey']
                lp.smiles = l['smiles']
                lp.mw = l['mw']
                lp.logp = l['logp']
                lp.rotatable_bonds = l['rotatable_bonds']
                lp.hacc = l['hacc']
                lp.hdon = l['hdon']
                lp.ligand_type = lt
                lp.save()

            ligand = Ligand()
            ligand.properities = lp
            ligand.name = l['name']
            ligand.canonical = l['canonical']
            ligand.ambigious_alias = l['ambigious_alias']
            ligand.save()

        # create links - impossible to make duplicates so no need to check if there already
        if ligand.properities.web_links.count() < len(l['web_links']):
            for link in l['web_links']:
                wr = WebResource.objects.get(slug=link['web_resource'])
                wl, created = WebLink.objects.get_or_create(
                    index=link['index'], web_resource=wr)
                ligand.properities.web_links.add(wl)

        # create vendors - skipped per link when a link with that sid already exists
        if ligand.properities.vendors.count() < len(l['vendors']):
            for link in l['vendors']:
                lv = LigandVendors.objects.get(slug=link['vendor_slug'])
                if not LigandVendorLink.objects.filter(sid=link['sid']).exists():
                    lvl = LigandVendorLink()
                    lvl.sid = link['sid']
                    lvl.vendor = lv
                    lvl.lp = ligand.properities
                    lvl.vendor_external_id = link['vendor_external_id']
                    lvl.url = link['url']
                    lvl.save()
def main_func(self, positions, iteration):
    """Build MutationRaw/MutationExperiment records for a slice of ``self.data``.

    Rows are resolved one at a time (publication, review, ligand, reference
    ligand, protein, residue, lookup tables), the resulting objects are
    collected, and both tables are written with ``bulk_create`` at the end.
    A row is skipped when a publication cannot be saved or when the protein
    or a residue with the expected wild-type amino acid is not found.

    Args:
        positions: pair of slice bounds into ``self.data``; a falsy second
            element means "to the end".
        iteration: unused here.
    """
    if not positions[1]:
        rows = self.data[positions[0]:]
    else:
        rows = self.data[positions[0]:positions[1]]

    # Experiment types whose values are -log10 based (matched as a prefix).
    log_type_pattern = re.compile("(" + ")|(".join(['pEC50', 'pIC50', 'pK']) + ")")

    c = 0
    skipped = 0
    bulk_m = []
    bulk_r = []
    current_sheet = time.time()

    for r in rows:
        c += 1

        # Fix ids that were read as floats. Each field gets its own try: with
        # a shared one, a DOI reference prevented the review from being
        # normalised at all.
        for key in ('reference', 'review'):
            try:
                float(r[key])
                r[key] = str(int(r[key]))
            except ValueError:
                pass

        pub = self._cached_publication(r['reference'], 'reference')
        if pub is None:
            continue  # if something off with publication, skip.

        if r['review']:
            pub_review = self._cached_publication(r['review'], 'review')
            if pub_review is None:
                continue  # if something off with publication, skip.
        else:
            pub_review = None

        # ligand, cached per name and then per ligand id
        l = None
        if str(r['ligand_name']) in self.ligand_cache:
            if r['ligand_id'] in self.ligand_cache[str(r['ligand_name'])]:
                l = self.ligand_cache[str(r['ligand_name'])][r['ligand_id']]
        else:
            self.ligand_cache[str(r['ligand_name'])] = {}

        if not l:
            l = get_or_make_ligand(r['ligand_id'], r['ligand_type'],
                                   str(r['ligand_name']))
            self.ligand_cache[str(r['ligand_name'])][r['ligand_id']] = l

        # reference ligand, cached per name
        l_ref = None
        if str(r['exp_mu_ligand_ref']) in self.ref_ligand_cache:
            l_ref = self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])]
        else:
            if Ligand.objects.filter(name=r['exp_mu_ligand_ref'],
                                     canonical=True).exists():
                # this name is canonical and it has a ligand record already
                l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                           canonical=True)
            elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False,
                                       ambigious_alias=False).exists():
                # alias that has only one parent canonical name - eg distinct
                l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                           canonical=False, ambigious_alias=False)
            elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False,
                                       ambigious_alias=True).exists():
                # alias with several canonical parents, must investigate,
                # start with empty.
                lp = LigandProperities()
                lp.save()
                l_ref = Ligand()
                l_ref.properities = lp
                l_ref.name = r['exp_mu_ligand_ref']
                l_ref.canonical = False
                l_ref.ambigious_alias = True
                l_ref.save()
                l_ref.load_by_name(r['exp_mu_ligand_ref'])
                l_ref.save()
            elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'],
                                       canonical=False).exists():
                # ambigious_alias not specified
                l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                           canonical=False)
                l_ref.ambigious_alias = False
                l_ref.save()
            elif r['exp_mu_ligand_ref']:
                # neither a canonical nor an alias exists, create the records.
                lp = LigandProperities()
                lp.save()
                l_ref = Ligand()
                l_ref.properities = lp
                l_ref.name = r['exp_mu_ligand_ref']
                l_ref.canonical = True
                l_ref.ambigious_alias = False
                l_ref.save()
                l_ref.load_by_name(r['exp_mu_ligand_ref'])
                try:
                    l_ref.save()
                except IntegrityError:
                    # NOTE(review): falls back to a non-canonical record even
                    # though this one was being saved as canonical -- confirm.
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                               canonical=False)
                    print("error failing ligand, duplicate?")
            else:
                l_ref = None
            self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])] = l_ref

        protein = Protein.objects.filter(entry_name=r['protein'])
        if protein.exists():
            protein = protein.get()
        else:
            skipped += 1
            self.logger.error('Skipped due to no protein '+ r['protein'])
            continue

        res = Residue.objects.filter(protein_conformation__protein=protein,
                                     amino_acid=r['mutation_from'],
                                     sequence_number=r['mutation_pos'])  # FIXME MAKE AA CHECK
        if res.exists():
            res = res.get()
        else:
            self.logger.error('Skipped due to no residue or mismatch AA ' + r['protein'] + ' pos:'+str(r['mutation_pos']) + ' AA:'+r['mutation_from'])
            skipped += 1
            continue

        if r['ligand_class']:
            l_role, created = LigandRole.objects.get_or_create(
                name=r['ligand_class'],
                defaults={'slug': slugify(r['ligand_class'])[:50]})  # FIXME this should not be needed
        else:
            l_role = None

        if r['exp_type']:
            exp_type_id, created = MutationExperimentalType.objects.get_or_create(
                type=r['exp_type'])
        else:
            exp_type_id = None

        if r['exp_func']:
            exp_func_id, created = MutationFunc.objects.get_or_create(
                func=r['exp_func'])
        else:
            exp_func_id = None

        if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']:
            exp_qual_id, created = MutationQual.objects.get_or_create(
                qual=r['exp_mu_effect_qual'], prop=r['exp_mu_effect_ligand_prop'])
        else:
            exp_qual_id = None

        if (r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r['opt_sign']
                or r['opt_percentage'] or r['opt_qual'] or r['opt_agonist']):
            exp_opt_id, created = MutationOptional.objects.get_or_create(
                type=r['opt_type'], wt=r['opt_wt'], mu=r['opt_mu'],
                sign=r['opt_sign'], percentage=r['opt_percentage'],
                qual=r['opt_qual'], agonist=r['opt_agonist'])
        else:
            exp_opt_id = None

        try:
            mutation, created = Mutation.objects.get_or_create(
                amino_acid=r['mutation_to'], protein=protein, residue=res)
        except IntegrityError:
            mutation = Mutation.objects.get(
                amino_acid=r['mutation_to'], protein=protein, residue=res)

        foldchange = 0
        if r['exp_wt_value'] != 0 and r['exp_mu_value_raw'] != 0:  # fix for new format
            if log_type_pattern.match(r['exp_type']):  # -log values!
                foldchange = round(
                    math.pow(10, -r['exp_mu_value_raw']) /
                    pow(10, -r['exp_wt_value']), 3)
            else:
                foldchange = round(r['exp_mu_value_raw'] / r['exp_wt_value'], 3)
            # Ratios below 1 are stored as the negated inverse.
            if foldchange < 1 and foldchange != 0:
                foldchange = -round((1 / foldchange), 3)
        elif r['fold_effect'] != 0:
            foldchange = round(r['fold_effect'], 3)
            # Same zero guard as above: a tiny fold_effect rounds to 0 and
            # used to raise ZeroDivisionError here.
            if foldchange < 1 and foldchange != 0:
                foldchange = -round((1 / foldchange), 3)

        raw_experiment = self.insert_raw(r)
        bulk = MutationExperiment(
            refs=pub,
            review=pub_review,
            protein=protein,
            residue=res,
            ligand=l,
            ligand_role=l_role,
            ligand_ref=l_ref,
            # raw is attached after the raw rows have been bulk-created below
            optional=exp_opt_id,
            exp_type=exp_type_id,
            exp_func=exp_func_id,
            exp_qual=exp_qual_id,
            mutation=mutation,
            wt_value=r['exp_wt_value'],
            mu_value=r['exp_mu_value_raw'],
            mu_sign=r['exp_mu_effect_sign'],
            foldchange=foldchange,
        )
        bulk_r.append(raw_experiment)
        bulk_m.append(bulk)

    self.logger.info('Parsed '+str(c)+' mutant data entries. Skipped '+str(skipped))

    current = time.time()
    raws = MutationRaw.objects.bulk_create(bulk_r)
    # Pair each experiment with the raw row created for it (same order).
    for experiment, raw in zip(bulk_m, raws):
        experiment.raw = raw
    MutationExperiment.objects.bulk_create(bulk_m)
    end = time.time()

    diff = round(end - current, 2)
    diff_2 = round(end - current_sheet, 2)
    print("overall",diff_2,"bulk",diff,len(bulk_m),"skipped",str(skipped))


def _cached_publication(self, index, label):
    """Return the Publication for ``index`` using ``self.publication_cache``.

    An all-digit ``index`` is treated as a PubMed id, anything else as a DOI.
    A missing Publication (and WebLink) is created and filled in from the
    matching source before it is saved.

    Args:
        index: publication identifier string.
        label: ``'reference'`` or ``'review'``; only used in the error log.

    Returns:
        The Publication, or ``None`` when a new one could not be saved.
    """
    if index in self.publication_cache:
        return self.publication_cache[index]

    pub_type = 'pubmed' if index.isdigit() else 'doi'
    try:
        pub = Publication.objects.get(web_link__index=index,
                                      web_link__web_resource__slug=pub_type)
    except Publication.DoesNotExist:
        pub = Publication()
        try:
            pub.web_link = WebLink.objects.get(index=index,
                                               web_resource__slug=pub_type)
        except WebLink.DoesNotExist:
            pub.web_link = WebLink.objects.create(
                index=index,
                web_resource=WebResource.objects.get(slug=pub_type))

        if pub_type == 'doi':
            pub.update_from_doi(doi=index)
        elif pub_type == 'pubmed':
            pub.update_from_pubmed_data(index=index)
        try:
            pub.save()
        except Exception:
            self.logger.error('error with ' + label + ' ' + str(index) + ' ' + pub_type)
            return None

    self.publication_cache[index] = pub
    return pub
def main_func(self, positions, iteration, count, lock):
    """Turn rows of ``self.data_all`` into MutationRaw / MutationExperiment records.

    Rows are claimed one at a time through the shared index ``count``, which
    is only read or advanced while ``lock`` is held (presumably so several
    workers can share one row list -- confirm against the caller). For each
    row the publication(s), ligand, reference ligand, protein, residue and
    the experiment lookup records are resolved (and created where the code
    below does so); rows whose publication, ligand, protein or residue cannot
    be resolved are skipped. Accepted rows are collected and written with two
    ``bulk_create`` calls after the loop.

    Args:
        positions: Not used by this implementation (slice-based row selection
            is disabled).
        iteration: Not used by this implementation.
        count: Object exposing an integer ``.value``: index of the next
            unclaimed row in ``self.data_all``.
        lock: Context manager guarding every access to ``count.value``.
    """
    missing_proteins = {}
    mutants_for_proteins = {}
    wrong_uniport_ids = {}  # submitted protein id -> entry_name it resolved to
    c = 0
    skipped = 0
    inserted = 0
    bulk_m = []
    bulk_r = []
    current_sheet = time.time()
    rows = self.data_all

    # Experiment types whose values are -log10 values; compiled once.
    logtypes = ['pEC50', 'pIC50', 'pK']
    log_type_re = re.compile("(" + ")|(".join(logtypes) + ")")

    def get_publication(index, pub_type, label):
        """Return the Publication for *index*, or None if it cannot be saved.

        Looks in ``self.publication_cache`` first, then the database, and
        otherwise creates the WebLink / Publication. ``label`` is only used
        in the error message ('reference' or 'review').
        """
        if index in self.publication_cache:
            return self.publication_cache[index]

        try:
            wl = WebLink.objects.get(index=index, web_resource__slug=pub_type)
        except WebLink.DoesNotExist:
            try:
                wl = WebLink.objects.create(
                    index=index,
                    web_resource=WebResource.objects.get(slug=pub_type))
            except IntegrityError:
                # Someone else created it between the get and the create.
                wl = WebLink.objects.get(index=index, web_resource__slug=pub_type)

        try:
            publication = Publication.objects.get(web_link=wl)
        except Publication.DoesNotExist:
            publication = Publication()
            try:
                publication.web_link = wl
                publication.save()
            except IntegrityError:
                publication = Publication.objects.get(web_link=wl)

            if pub_type == 'doi':
                publication.update_from_doi(doi=index)
            elif pub_type == 'pubmed':
                publication.update_from_pubmed_data(index=index)
            try:
                publication.save()
            except Exception:
                self.logger.error('error with ' + label + ' ' + str(index) + ' ' + pub_type)
                return None  # if something off with publication, skip.

        self.publication_cache[index] = publication
        return publication

    while True:
        # The bounds check must happen under the lock: checking it outside
        # lets a worker pass the test and then index past the end of rows
        # after another worker claimed the last row.
        with lock:
            if count.value >= len(rows):
                break
            r = rows[count.value]
            count.value += 1
        c += 1

        # Fix ids that were parsed as floats. Each field is handled on its
        # own so a non-numeric reference (a DOI) does not prevent the review
        # id from being normalised.
        for key in ('reference', 'review'):
            try:
                float(r[key])
                r[key] = str(int(r[key]))
            except ValueError:
                pass

        # publication: all-digit ids are assumed to be pubmed, the rest doi
        pub_type = 'pubmed' if r['reference'].isdigit() else 'doi'
        pub = get_publication(r['reference'], pub_type, 'reference')
        if pub is None:
            continue

        if r['review']:
            if r['review'].isdigit():
                pub_type = 'pubmed'
            elif r['review'].startswith('http'):
                pub_type = 'raw_link'
            else:
                pub_type = 'doi'
            pub_review = get_publication(r['review'], pub_type, 'review')
            if pub_review is None:
                continue
        else:
            pub_review = None

        # ligand, cached per name and then per ligand id
        ligand_name = str(r['ligand_name'])
        ligands_by_id = self.ligand_cache.setdefault(ligand_name, {})
        ligand = ligands_by_id.get(r['ligand_id'])
        if not ligand:
            try:
                ligand = get_or_make_ligand(r['ligand_id'], r['ligand_type'], ligand_name)
            except Exception as msg:
                print('Something errored with ligand, aborting entry of mutation',r['ligand_name'],r['ligand_type'],r['ligand_id'],r['source_file'])
                print(msg)
                traceback.print_exc()
                continue
            ligands_by_id[r['ligand_id']] = ligand

        # reference ligand, cached per name
        ref_name = r['exp_mu_ligand_ref']
        ref_key = str(ref_name)
        if ref_key in self.ref_ligand_cache:
            l_ref = self.ref_ligand_cache[ref_key]
        else:
            if Ligand.objects.filter(name=ref_name, canonical=True).exists():
                # this name is canonical and it has a ligand record already
                l_ref = Ligand.objects.get(name=ref_name, canonical=True)
            elif Ligand.objects.filter(name=ref_name, canonical=False, ambigious_alias=False).exists():
                # matches an alias that only has "one" parent canonical name - eg distinct
                l_ref = Ligand.objects.get(name=ref_name, canonical=False, ambigious_alias=False)
            elif Ligand.objects.filter(name=ref_name, canonical=False, ambigious_alias=True).exists():
                # matches an alias that has several canonical parents,
                # must investigate, start with empty.
                lp = LigandProperities()
                lp.save()
                l_ref = Ligand()
                l_ref.properities = lp
                l_ref.name = ref_name
                l_ref.canonical = False
                l_ref.ambigious_alias = True
                l_ref.save()
                l_ref.load_by_name(ref_name)
                l_ref.save()
            elif Ligand.objects.filter(name=ref_name, canonical=False).exists():
                # ambigious_alias not specified
                l_ref = Ligand.objects.get(name=ref_name, canonical=False)
                l_ref.ambigious_alias = False
                l_ref.save()
            elif ref_name:
                # neither a canonical nor an alias exists, create the records.
                # Remember to check for canonical / alias status.
                lp = LigandProperities()
                lp.save()
                l_ref = Ligand()
                l_ref.properities = lp
                l_ref.name = ref_name
                l_ref.canonical = True
                l_ref.ambigious_alias = False
                try:
                    l_ref.save()
                    l_ref.load_by_name(ref_name)
                except IntegrityError:
                    if Ligand.objects.filter(name=ref_name, canonical=True).exists():
                        l_ref = Ligand.objects.get(name=ref_name, canonical=True)
                    else:
                        l_ref = Ligand.objects.get(name=ref_name, canonical=False)
                try:
                    l_ref.save()
                except IntegrityError:
                    l_ref = Ligand.objects.get(name=ref_name, canonical=False)
            else:
                l_ref = None
            self.ref_ligand_cache[ref_key] = l_ref

        # protein
        matches = Protein.objects.filter(entry_name=r['protein'])
        if matches.exists():
            protein = matches.get()
            mutants_for_proteins[r['protein']] = mutants_for_proteins.get(r['protein'], 0) + 1
        elif r['protein'] not in missing_proteins:
            protein = None
            # Reuse a correction found for an earlier row. The id is looked
            # up once; overwriting r['protein'] before a second lookup made
            # the lookup table always miss.
            corrected = wrong_uniport_ids.get(r['protein'])
            if corrected is not None:
                try:
                    protein = Protein.objects.get(entry_name=corrected)
                except Protein.DoesNotExist:
                    protein = None
                else:
                    r['protein'] = corrected

            if protein is None:
                # look for it as uniprot
                by_uniprot = Protein.objects.filter(
                    web_links__web_resource__slug='uniprot',
                    web_links__index=r['protein'].upper())
                if by_uniprot.exists():
                    protein = by_uniprot.get()
                    real_uniprot = protein.entry_name
                    mutants_for_proteins[r['protein']] = mutants_for_proteins.get(r['protein'], 0) + 1
                else:
                    # Try to lookup in uniprot to catch typing errors / variants in entry_name
                    url = 'http://www.uniprot.org/uniprot/$index.xml'
                    cache_dir = ['uniprot', 'id']
                    uniprot_protein = fetch_from_web_api(url, r['protein'], cache_dir, xml=True)
                    try:
                        real_uniprot = uniprot_protein.find('.//{http://uniprot.org/uniprot}name').text.lower()
                        protein = Protein.objects.get(entry_name=real_uniprot)
                    except Exception:
                        skipped += 1
                        missing_proteins[r['protein']] = missing_proteins.get(r['protein'], 0) + 1
                        self.logger.error('Skipped due to no protein '+ r['protein'])
                        continue
                wrong_uniport_ids[r['protein']] = protein.entry_name
                r['protein'] = real_uniprot
        else:
            # Already known to be missing: count it and move on.
            missing_proteins[r['protein']] += 1
            continue

        # residue  FIXME MAKE AA CHECK
        res = Residue.objects.filter(
            protein_conformation__protein=protein,
            amino_acid=r['mutation_from'],
            sequence_number=r['mutation_pos'])
        if res.exists():
            res = res.get()
        else:
            self.logger.error('Skipped due to no residue or mismatch AA ' + r['protein'] + ' pos:'+str(r['mutation_pos']) + ' AA:'+r['mutation_from'])
            skipped += 1
            continue

        # ligand role
        if r['ligand_class']:
            role_slug = slugify(r['ligand_class'])[:50]
            try:
                l_role, created = LigandRole.objects.get_or_create(
                    name=r['ligand_class'],
                    defaults={'slug': role_slug})  # FIXME this should not be needed
            except Exception as e:
                if LigandRole.objects.filter(slug=role_slug).exists():
                    l_role = LigandRole.objects.get(slug=role_slug)
                    if l_role.name == role_slug:
                        # name of role is same as slug, then it was created
                        # by constructs script, replace it
                        l_role.name = r['ligand_class']
                        l_role.save()
                else:
                    print(e)
                    print("Error with",r['ligand_class'],role_slug )
                    l_role, created = LigandRole.objects.get_or_create(slug=role_slug)  # FIXME this should not be needed
        else:
            l_role = None

        if r['exp_type']:
            exp_type_id, created = MutationExperimentalType.objects.get_or_create(type=r['exp_type'])
        else:
            exp_type_id = None

        if r['exp_func']:
            exp_func_id, created = MutationFunc.objects.get_or_create(func=r['exp_func'])
        else:
            exp_func_id = None

        if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']:
            exp_qual_id, created = MutationQual.objects.get_or_create(
                qual=r['exp_mu_effect_qual'], prop=r['exp_mu_effect_ligand_prop'])
        else:
            exp_qual_id = None

        try:
            mutation, created = Mutation.objects.get_or_create(
                amino_acid=r['mutation_to'], protein=protein, residue=res)
        except IntegrityError:
            mutation = Mutation.objects.get(
                amino_acid=r['mutation_to'], protein=protein, residue=res)

        # fold change; ratios in (0, 1) are stored as the negated inverse
        foldchange = 0
        if r['exp_wt_value'] != 0 and r['exp_mu_value_raw'] != 0:  # fix for new format
            if log_type_re.match(r['exp_type']):
                # -log values!
                try:
                    foldchange = round(math.pow(10, -r['exp_mu_value_raw'])/pow(10, -r['exp_wt_value']), 3)
                except (ArithmeticError, TypeError, ValueError):
                    print(r)
            elif "%" == r['exp_wt_unit']:
                # if % then it's a difference case, then lower value is bad.
                # Otherwise it's conc and lower is better
                foldchange = round(r['exp_wt_value']/r['exp_mu_value_raw'], 3)
            else:
                foldchange = round(r['exp_mu_value_raw']/r['exp_wt_value'], 3)
            if 0 < foldchange < 1:
                foldchange = -round((1/foldchange), 3)
        elif r['fold_effect'] != 0:
            foldchange = round(r['fold_effect'], 3)
            # Same guard as above: a bare "< 1" test also inverted negative
            # values and divided by zero when the value rounded to 0.
            if 0 < foldchange < 1:
                foldchange = -round((1/foldchange), 3)
        r['fold_effect'] = foldchange

        raw_experiment = self.insert_raw(r)
        bulk = MutationExperiment(
            refs=pub,
            review=pub_review,
            submitting_group=r['submitting_group'],
            data_container=r['data_container'],
            data_container_number=r['data_container_number'],
            protein=protein,
            residue=res,
            ligand=ligand,
            ligand_role=l_role,
            ligand_ref=l_ref,
            exp_type=exp_type_id,
            exp_func=exp_func_id,
            exp_qual=exp_qual_id,
            mutation=mutation,
            wt_value=r['exp_wt_value'],
            mu_value=r['exp_mu_value_raw'],
            mu_sign=r['exp_mu_effect_sign'],
            foldchange=foldchange,
            opt_receptor_expression=r['opt_receptor_expression'],
            opt_basal_activity=r['opt_basal_activity'],
            opt_gain_of_activity=r['opt_gain_of_activity'],
            opt_ligand_emax=r['opt_ligand_emax'],
            opt_agonist=r['opt_agonist'],
        )
        # raw and experiment are appended together so the two lists stay
        # aligned for the pairing done after the loop
        bulk_r.append(raw_experiment)
        bulk_m.append(bulk)
        inserted += 1

    self.logger.info('Parsed '+str(c)+' mutant data entries. Skipped '+str(skipped))

    current = time.time()
    raws = MutationRaw.objects.bulk_create(bulk_r)
    # Attach each raw record returned by bulk_create to its experiment.
    for experiment, raw in zip(bulk_m, raws):
        experiment.raw = raw
    MutationExperiment.objects.bulk_create(bulk_m)
    end = time.time()
    diff = round(end - current, 2)
    diff_2 = round(end - current_sheet, 2)
    print("overall",diff_2,"bulk",diff,len(bulk_m),"skipped",str(skipped))

    sorted_missing_proteins = sorted(missing_proteins.items(), key=operator.itemgetter(1), reverse=True)