def create_empty_ligand(self, ligand_name):
    # gtoplig webresource
    lp = self.build_ligand_properties()
    ligand = Ligand()
    ligand.properities = lp
    ligand.name = ligand_name
    ligand.canonical = True
    ligand.ambigious_alias = False
    ligand.pdbe = None
    try:
        ligand.save()
    except IntegrityError:
        self.logger.info("empty ligand found")
        return Ligand.objects.get(name=ligand_name, canonical=True)
    return ligand
def chose_reference_from_assays(self, assays):
    references = list()
    final_assay = list()
    reference_ligand = Ligand()
    for i in reversed(assays):
        if (i['quantitive_activity'] and i['quantitive_activity'] is not None
                and i['quantitive_efficacy'] and i['quantitive_efficacy'] is not None
                and i['ligand'] is not None):
            reference_ligand = i['ligand']
    reference_return = assays.copy()
    assay_return = assays.copy()
    references = self.filter_reference_assay(reference_return, reference_ligand)
    final_assay = self.filter_assay_reference(assay_return, reference_ligand)
    self.logger.info('return reference assay')
    return references, final_assay
def get_or_make_ligand(ligand_id, type_id, name=None): if type_id == 'PubChem CID' or type_id == 'SMILES': if type_id == 'PubChem CID': pubchem_lookup_value = 'cid' elif type_id == 'SMILES': pubchem_lookup_value = 'smiles' try: web_resource = WebResource.objects.get(slug='pubchem') except: # abort if pdb resource is not found raise Exception('PubChem resource not found, aborting!') if name: ligand_name = name else: ligand_name = False try: # if this name is canonical and it has a ligand record already if (ligand_name == False): l = None ls = Ligand.objects.filter( canonical=True, properities__web_links__web_resource=web_resource, properities__web_links__index=ligand_id) for ligand in ls: l = ligand #print (l) break if l == None: l = Ligand.objects.get( canonical=True, properities__web_links__web_resource=web_resource, properities__web_links__index=ligand_id) else: l = Ligand.objects.get( name=ligand_name, canonical=True, properities__web_links__web_resource=web_resource, properities__web_links__index=ligand_id) #l = Ligand.objects.get(name=ligand_name, canonical=True, # properities__web_links__web_resource=web_resource, # properities__web_links__index=ligand_id) # except Ligand.DoesNotExist: try: # if exists under different name l_canonical = Ligand.objects.get( properities__web_links__web_resource=web_resource, properities__web_links__index=ligand_id, canonical=True) #print (created) try: l, created = Ligand.objects.get_or_create( properities=l_canonical.properities, name=ligand_name, canonical=False) except IntegrityError: l = Ligand.objects.get(properities=l_canonical.properities, name=ligand_name, canonical=False) except Ligand.DoesNotExist: # fetch ligand from pubchem default_ligand_type = 'Small molecule' lt, created = LigandType.objects.get_or_create( slug=slugify(default_ligand_type), defaults={'name': default_ligand_type}) l = Ligand() #print (ligand_name) l = l.load_from_pubchem(pubchem_lookup_value, ligand_id, lt, ligand_name) #print (l) if l == None and type_id == 'SMILES': #insert manually if smiles and unfound in pubchem try: l = Ligand.objects.get(name=ligand_name, canonical=True, properities__smiles=ligand_id) except Ligand.DoesNotExist: try: l = Ligand.objects.get( name__startswith=ligand_name, canonical=True, properities__smiles=ligand_id ) #if no properities exist except Ligand.DoesNotExist: try: l = Ligand.objects.get( name=ligand_name, canonical=True, properities__smiles=None ) #if no properities exist l.properities.smiles = ligand_id l.properities.save() l.save() except Ligand.DoesNotExist: ## now insert a new ligand, but first make sure name is unique if Ligand.objects.filter( name=ligand_name).exists(): ls = Ligand.objects.filter( name__startswith=ligand_name, canonical=True).order_by("pk") for l_temp in ls: last = l_temp.name.split("_")[-1] if last == ligand_name: #no addition yet ligand_name = ligand_name + "_1" else: ligand_name = ligand_name + "_" + str( int(last) + 1) l = Ligand() l.name = ligand_name lp = LigandProperities() lp.smiles = ligand_id lp.ligand_type = lt lp.save() l.properities = lp l.canonical = True #maybe false, but that would break stuff. 
l.ambigious_alias = False try: l.save() except IntegrityError: l = Ligand.objects.get(name=ligand_name, canonical=True) elif name: # if this name is canonical and it has a ligand record already if Ligand.objects.filter(name=name, canonical=True).exists(): l = Ligand.objects.get(name=name, canonical=True) # if this matches an alias that only has "one" parent canonical name - eg distinct elif Ligand.objects.filter(name=name, canonical=False, ambigious_alias=False).exists(): l = Ligand.objects.get(name=name, canonical=False, ambigious_alias=False) # if this matches an alias that only has several canonical parents, must investigate, start # with empty. elif Ligand.objects.filter(name=name, canonical=False, ambigious_alias=True).exists(): lp = LigandProperities() lp.save() l = Ligand() l.properities = lp l.name = name l.canonical = False l.ambigious_alias = True l.save() l.load_by_name(name) # if neither a canonical or alias exists, create the records. Remember to check for # canonical / alias status. else: lp = LigandProperities() lp.save() l = Ligand() l.properities = lp l.name = str(name) l.canonical = True l.ambigious_alias = False try: l.save() l.load_by_name(str(name)) except IntegrityError: l = Ligand.objects.get(name=str(name), canonical=True) else: l = None return l
def main_func(self, positions, iteration): # filenames if not positions[1]: rows = self.data[positions[0]:] else: rows = self.data[positions[0]:positions[1]] missing_proteins = {} mutants_for_proteins = {} c = 0 skipped = 0 inserted = 0 bulk_m = [] bulk_r = [] current_sheet = time.time() for r in rows: # print(source_file,c) # PRINT IF ERRORS OCCUR # self.logger.info('File '+str(r['source_file'])+' number '+str(c)) current = time.time() c += 1 # if c%100==0: # self.logger.info('Parsed '+str(c)+' mutant data entries') # publication try: #fix if it thinks it's float. float(r['reference']) r['reference'] = str(int(r['reference'])) float(r['review']) r['review'] = str(int(r['review'])) except ValueError: pass if r['reference'].isdigit(): #assume pubmed pub_type = 'pubmed' else: #assume doi pub_type = 'doi' if r['reference'] not in self.publication_cache: try: pub = Publication.objects.get( web_link__index=r['reference'], web_link__web_resource__slug=pub_type) except Publication.DoesNotExist: pub = Publication() try: pub.web_link = WebLink.objects.get( index=r['reference'], web_resource__slug=pub_type) except WebLink.DoesNotExist: wl = WebLink.objects.create( index=r['reference'], web_resource=WebResource.objects.get( slug=pub_type)) pub.web_link = wl if pub_type == 'doi': pub.update_from_doi(doi=r['reference']) elif pub_type == 'pubmed': pub.update_from_pubmed_data(index=r['reference']) try: pub.save() except: self.logger.error('error with reference ' + str(r['reference']) + ' ' + pub_type) continue #if something off with publication, skip. self.publication_cache[r['reference']] = pub else: pub = self.publication_cache[r['reference']] # print(r['review'],r['reference']) if r['review'].isdigit(): #assume pubmed pub_type = 'pubmed' else: #assume doi pub_type = 'doi' # print(r['review'],pub_type) if r['review']: if r['review'] not in self.publication_cache: try: pub_review = Publication.objects.get( web_link__index=r['review'], web_link__web_resource__slug=pub_type) except Publication.DoesNotExist: pub_review = Publication() try: pub_review.web_link = WebLink.objects.get( index=r['review'], web_resource__slug=pub_type) except WebLink.DoesNotExist: wl = WebLink.objects.create( index=r['review'], web_resource=WebResource.objects.get( slug=pub_type)) pub_review.web_link = wl if pub_type == 'doi': pub_review.update_from_doi(doi=r['review']) elif pub_type == 'pubmed': pub_review.update_from_pubmed_data( index=r['review']) try: pub_review.save() except: self.logger.error('error with review ' + str(r['review']) + ' ' + pub_type) continue #if something off with publication, skip. 
self.publication_cache[r['review']] = pub_review else: pub_review = self.publication_cache[r['review']] else: pub_review = None l = None if str(r['ligand_name']) in self.ligand_cache: if r['ligand_id'] in self.ligand_cache[str(r['ligand_name'])]: l = self.ligand_cache[str( r['ligand_name'])][r['ligand_id']] else: self.ligand_cache[str(r['ligand_name'])] = {} if not l: l = get_or_make_ligand(r['ligand_id'], r['ligand_type'], str(r['ligand_name'])) self.ligand_cache[str(r['ligand_name'])][r['ligand_id']] = l l_ref = None if str(r['exp_mu_ligand_ref']) in self.ref_ligand_cache: l_ref = self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])] else: if Ligand.objects.filter( name=r['exp_mu_ligand_ref'], canonical=True ).exists( ): #if this name is canonical and it has a ligand record already l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=True) elif Ligand.objects.filter( name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=False ).exists( ): #if this matches an alias that only has "one" parent canonical name - eg distinct l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=False) elif Ligand.objects.filter( name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=True ).exists( ): #if this matches an alias that only has several canonical parents, must investigate, start with empty. lp = LigandProperities() lp.save() l_ref = Ligand() l_ref.properities = lp l_ref.name = r['exp_mu_ligand_ref'] l_ref.canonical = False l_ref.ambigious_alias = True l_ref.save() l_ref.load_by_name(r['exp_mu_ligand_ref']) l_ref.save() elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False).exists( ): #amigious_alias not specified l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False) l_ref.ambigious_alias = False l_ref.save() elif r['exp_mu_ligand_ref']: #if neither a canonical or alias exists, create the records. Remember to check for canonical / alias status. 
lp = LigandProperities() lp.save() l_ref = Ligand() l_ref.properities = lp l_ref.name = r['exp_mu_ligand_ref'] l_ref.canonical = True l_ref.ambigious_alias = False l_ref.save() l_ref.load_by_name(r['exp_mu_ligand_ref']) try: l_ref.save() except IntegrityError: l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False) print("error failing ligand, duplicate?") # logger.error("FAILED SAVING LIGAND, duplicate?") else: l_ref = None self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])] = l_ref protein_id = 0 residue_id = 0 protein = Protein.objects.filter(entry_name=r['protein']) if protein.exists(): protein = protein.get() if r['protein'] in mutants_for_proteins: mutants_for_proteins[r['protein']] += 1 else: mutants_for_proteins[r['protein']] = 1 else: skipped += 1 if r['protein'] in missing_proteins: missing_proteins[r['protein']] += 1 else: missing_proteins[r['protein']] = 1 self.logger.error('Skipped due to no protein ' + r['protein']) continue res = Residue.objects.filter( protein_conformation__protein=protein, amino_acid=r['mutation_from'], sequence_number=r['mutation_pos']) #FIXME MAKE AA CHECK if res.exists(): res = res.get() else: self.logger.error('Skipped due to no residue or mismatch AA ' + r['protein'] + ' pos:' + str(r['mutation_pos']) + ' AA:' + r['mutation_from']) skipped += 1 continue if r['ligand_class']: l_role, created = LigandRole.objects.get_or_create( name=r['ligand_class'], defaults={'slug': slugify(r['ligand_class'])[:50] }) # FIXME this should not be needed else: l_role = None if r['exp_type']: exp_type_id, created = MutationExperimentalType.objects.get_or_create( type=r['exp_type']) else: exp_type_id = None if r['exp_func']: exp_func_id, created = MutationFunc.objects.get_or_create( func=r['exp_func']) else: exp_func_id = None if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']: exp_qual_id, created = MutationQual.objects.get_or_create( qual=r['exp_mu_effect_qual'], prop=r['exp_mu_effect_ligand_prop']) else: exp_qual_id = None if r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r[ 'opt_sign'] or r['opt_percentage'] or r['opt_qual'] or r[ 'opt_agonist']: exp_opt_id, created = MutationOptional.objects.get_or_create( type=r['opt_type'], wt=r['opt_wt'], mu=r['opt_mu'], sign=r['opt_sign'], percentage=r['opt_percentage'], qual=r['opt_qual'], agonist=r['opt_agonist']) else: exp_opt_id = None try: mutation, created = Mutation.objects.get_or_create( amino_acid=r['mutation_to'], protein=protein, residue=res) except IntegrityError: mutation = Mutation.objects.get(amino_acid=r['mutation_to'], protein=protein, residue=res) logtypes = ['pEC50', 'pIC50', 'pK'] foldchange = 0 typefold = '' if r['exp_wt_value'] != 0 and r[ 'exp_mu_value_raw'] != 0: #fix for new format if re.match("(" + ")|(".join(logtypes) + ")", r['exp_type']): #-log values! 
foldchange = round( math.pow(10, -r['exp_mu_value_raw']) / pow(10, -r['exp_wt_value']), 3) typefold = r['exp_type'] + "_log" else: foldchange = round( r['exp_mu_value_raw'] / r['exp_wt_value'], 3) typefold = r['exp_type'] + "_not_log" if foldchange < 1 and foldchange != 0: foldchange = -round((1 / foldchange), 3) elif r['fold_effect'] != 0: foldchange = round(r['fold_effect'], 3) if foldchange < 1: foldchange = -round((1 / foldchange), 3) raw_experiment = self.insert_raw(r) bulk = MutationExperiment( refs=pub, review=pub_review, protein=protein, residue=res, ligand=l, ligand_role=l_role, ligand_ref=l_ref, #raw = raw_experiment, #raw_experiment, OR None optional=exp_opt_id, exp_type=exp_type_id, exp_func=exp_func_id, exp_qual=exp_qual_id, mutation=mutation, wt_value=r['exp_wt_value'], # wt_unit=r['exp_wt_unit'], mu_value=r['exp_mu_value_raw'], mu_sign=r['exp_mu_effect_sign'], foldchange=foldchange) # mut_id = obj.id bulk_r.append(raw_experiment) bulk_m.append(bulk) inserted += 1 end = time.time() diff = round(end - current, 2) #print(diff) self.logger.info('Parsed ' + str(c) + ' mutant data entries. Skipped ' + str(skipped)) current = time.time() raws = MutationRaw.objects.bulk_create(bulk_r) for i, me in enumerate(bulk_m): me.raw = raws[i] MutationExperiment.objects.bulk_create(bulk_m) end = time.time() diff = round(end - current, 2) current_sheet diff_2 = round(end - current_sheet, 2) print("overall", diff_2, "bulk", diff, len(bulk_m), "skipped", str(skipped))
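The fold-change bookkeeping above converts p-scale measurements (pEC50, pIC50, pK) back to linear values before taking the mutant/wild-type ratio, and reports ratios below 1 as negative fold changes. A minimal standalone restatement of that rule follows; the helper name and the example numbers are illustrative, not part of the loader.

import math

def fold_change(wt_value, mu_value, exp_type, logtypes=('pEC50', 'pIC50', 'pK')):
    """Illustrative restatement of the fold-change rule used above."""
    if any(exp_type.startswith(t) for t in logtypes):
        # p-scale values are -log10 of a concentration, so undo the log first
        fold = round(math.pow(10, -mu_value) / math.pow(10, -wt_value), 3)
    else:
        fold = round(mu_value / wt_value, 3)
    # ratios below 1 are flipped to negative fold changes (e.g. 0.01 -> -100.0)
    if 0 < fold < 1:
        fold = -round(1 / fold, 3)
    return fold

# fold_change(8, 6, 'pEC50') -> 100.0
# fold_change(6, 8, 'pEC50') -> -100.0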
def main_func(self, positions, iteration, count, lock): # filenames # if not positions[1]: # rows = self.data[positions[0]:] # else: # rows = self.data[positions[0]:positions[1]] missing_proteins = {} mutants_for_proteins = {} wrong_uniport_ids = {} c = 0 skipped = 0 inserted = 0 bulk_m = [] bulk_r = [] current_sheet = time.time() rows = self.data_all while count.value < len(rows): with lock: r = rows[count.value] count.value += 1 # for r in rows: # print(r['source_file'],c) # PRINT IF ERRORS OCCUR #self.logger.info('File '+str(r['source_file'])+' number '+str(c)) current = time.time() c += 1 # if c%100==0: # self.logger.info('Parsed '+str(c)+' mutant data entries') # publication try: #fix if it thinks it's float. float(r['reference']) r['reference'] = str(int(r['reference'])) float(r['review']) r['review'] = str(int(r['review'])) except ValueError: pass if r['reference'].isdigit(): #assume pubmed pub_type = 'pubmed' else: #assume doi pub_type = 'doi' if r['reference'] not in self.publication_cache: try: wl = WebLink.objects.get(index=r['reference'], web_resource__slug=pub_type) except WebLink.DoesNotExist: try: wl = WebLink.objects.create( index=r['reference'], web_resource=WebResource.objects.get( slug=pub_type)) except IntegrityError: wl = WebLink.objects.get(index=r['reference'], web_resource__slug=pub_type) try: pub = Publication.objects.get(web_link=wl) except Publication.DoesNotExist: pub = Publication() try: pub.web_link = wl pub.save() except IntegrityError: pub = Publication.objects.get(web_link=wl) if pub_type == 'doi': pub.update_from_doi(doi=r['reference']) elif pub_type == 'pubmed': pub.update_from_pubmed_data(index=r['reference']) try: pub.save() except: self.logger.error('error with reference ' + str(r['reference']) + ' ' + pub_type) continue #if something off with publication, skip. self.publication_cache[r['reference']] = pub else: pub = self.publication_cache[r['reference']] # print(r['review'],r['reference']) if r['review'].isdigit(): #assume pubmed pub_type = 'pubmed' elif r['review'].startswith('http'): pub_type = 'raw_link' else: #assume doi pub_type = 'doi' # print(r['review'],pub_type) if r['review']: if r['review'] not in self.publication_cache: try: wl = WebLink.objects.get(index=r['review'], web_resource__slug=pub_type) except WebLink.DoesNotExist: try: wl = WebLink.objects.create( index=r['review'], web_resource=WebResource.objects.get( slug=pub_type)) except IntegrityError: wl = WebLink.objects.get( index=r['review'], web_resource__slug=pub_type) try: pub_review = Publication.objects.get(web_link=wl) except Publication.DoesNotExist: pub_review = Publication() try: pub_review.web_link = wl pub_review.save() except IntegrityError: pub_review = Publication.objects.get(web_link=wl) if pub_type == 'doi': pub_review.update_from_doi(doi=r['review']) elif pub_type == 'pubmed': pub_review.update_from_pubmed_data( index=r['review']) try: pub_review.save() except: self.logger.error('error with review ' + str(r['review']) + ' ' + pub_type) continue #if something off with publication, skip. 
self.publication_cache[r['review']] = pub_review else: pub_review = self.publication_cache[r['review']] else: pub_review = None l = None if str(r['ligand_name']) in self.ligand_cache: if r['ligand_id'] in self.ligand_cache[str(r['ligand_name'])]: l = self.ligand_cache[str( r['ligand_name'])][r['ligand_id']] else: self.ligand_cache[str(r['ligand_name'])] = {} if not l: try: l = get_or_make_ligand(r['ligand_id'], r['ligand_type'], str(r['ligand_name'])) except Exception as msg: print( 'Something errored with ligand, aborting entry of mutation', r['ligand_name'], r['ligand_type'], r['ligand_id'], r['source_file']) print(msg) traceback.print_exc() continue self.ligand_cache[str(r['ligand_name'])][r['ligand_id']] = l l_ref = None if str(r['exp_mu_ligand_ref']) in self.ref_ligand_cache: l_ref = self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])] else: if Ligand.objects.filter( name=r['exp_mu_ligand_ref'], canonical=True ).exists( ): #if this name is canonical and it has a ligand record already l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=True) elif Ligand.objects.filter( name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=False ).exists( ): #if this matches an alias that only has "one" parent canonical name - eg distinct l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=False) elif Ligand.objects.filter( name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=True ).exists( ): #if this matches an alias that only has several canonical parents, must investigate, start with empty. lp = LigandProperities() lp.save() l_ref = Ligand() l_ref.properities = lp l_ref.name = r['exp_mu_ligand_ref'] l_ref.canonical = False l_ref.ambigious_alias = True l_ref.save() l_ref.load_by_name(r['exp_mu_ligand_ref']) l_ref.save() elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False).exists( ): #amigious_alias not specified l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False) l_ref.ambigious_alias = False l_ref.save() elif r['exp_mu_ligand_ref']: #if neither a canonical or alias exists, create the records. Remember to check for canonical / alias status. 
lp = LigandProperities() lp.save() l_ref = Ligand() l_ref.properities = lp l_ref.name = r['exp_mu_ligand_ref'] l_ref.canonical = True l_ref.ambigious_alias = False try: l_ref.save() l_ref.load_by_name(r['exp_mu_ligand_ref']) except IntegrityError: if Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=True).exists(): l_ref = Ligand.objects.get( name=r['exp_mu_ligand_ref'], canonical=True) else: l_ref = Ligand.objects.get( name=r['exp_mu_ligand_ref'], canonical=False) # print("error failing ligand, duplicate?") try: l_ref.save() except IntegrityError: l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False) # print("error failing ligand, duplicate?") # logger.error("FAILED SAVING LIGAND, duplicate?") else: l_ref = None self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])] = l_ref protein_id = 0 residue_id = 0 protein = Protein.objects.filter(entry_name=r['protein']) if protein.exists(): protein = protein.get() if r['protein'] in mutants_for_proteins: mutants_for_proteins[r['protein']] += 1 else: mutants_for_proteins[r['protein']] = 1 elif r['protein'] not in missing_proteins: try: r['protein'] = wrong_uniport_ids[r['protein']] real_uniprot = wrong_uniport_ids[r['protein']] protein = Protein.objects.get(entry_name=r['protein']) # print('fetched with lookup table',r['protein']) except: # look for it as uniprot protein = Protein.objects.filter( web_links__web_resource__slug='uniprot', web_links__index=r['protein'].upper()) if protein.exists(): protein = protein.get() real_uniprot = protein.entry_name if r['protein'] in mutants_for_proteins: mutants_for_proteins[r['protein']] += 1 else: mutants_for_proteins[r['protein']] = 1 else: # Try to lookup in uniprot to catch typing errors / variants in entry_name url = 'http://www.uniprot.org/uniprot/$index.xml' cache_dir = ['uniprot', 'id'] uniprot_protein = fetch_from_web_api(url, r['protein'], cache_dir, xml=True) try: real_uniprot = uniprot_protein.find( './/{http://uniprot.org/uniprot}name' ).text.lower() protein = Protein.objects.get( entry_name=real_uniprot) except: skipped += 1 if r['protein'] in missing_proteins: missing_proteins[r['protein']] += 1 else: missing_proteins[r['protein']] = 1 # print('Skipped due to no protein '+ r['protein']) self.logger.error( 'Skipped due to no protein ' + r['protein']) continue wrong_uniport_ids[r['protein']] = protein.entry_name r['protein'] = real_uniprot else: missing_proteins[r['protein']] += 1 continue res = Residue.objects.filter( protein_conformation__protein=protein, amino_acid=r['mutation_from'], sequence_number=r['mutation_pos']) #FIXME MAKE AA CHECK if res.exists(): res = res.get() else: self.logger.error('Skipped due to no residue or mismatch AA ' + r['protein'] + ' pos:' + str(r['mutation_pos']) + ' AA:' + r['mutation_from']) # print('Skipped due to no residue or mismatch AA ' + r['protein'] + ' pos:'+str(r['mutation_pos']) + ' AA:'+r['mutation_from'],r['source_file']) skipped += 1 continue if r['ligand_class']: try: l_role, created = LigandRole.objects.get_or_create( name=r['ligand_class'], defaults={'slug': slugify(r['ligand_class'])[:50] }) # FIXME this should not be needed except Exception as e: if LigandRole.objects.filter( slug=slugify(r['ligand_class'])[:50]).exists(): l_role = LigandRole.objects.get( slug=slugify(r['ligand_class'])[:50]) if l_role.name == slugify(r['ligand_class'])[:50]: #if name of role is same as slug, then it was created by constructs script, replace it l_role.name = r['ligand_class'] l_role.save() else: print(e) print("Error with", 
r['ligand_class'], slugify(r['ligand_class'])[:50]) l_role, created = LigandRole.objects.get_or_create( slug=slugify(r['ligand_class']) [:50]) # FIXME this should not be needed else: l_role = None if r['exp_type']: exp_type_id, created = MutationExperimentalType.objects.get_or_create( type=r['exp_type']) else: exp_type_id = None if r['exp_func']: exp_func_id, created = MutationFunc.objects.get_or_create( func=r['exp_func']) else: exp_func_id = None if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']: exp_qual_id, created = MutationQual.objects.get_or_create( qual=r['exp_mu_effect_qual'], prop=r['exp_mu_effect_ligand_prop']) else: exp_qual_id = None # if r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r['opt_sign'] or r['opt_percentage'] or r['opt_qual'] or r['opt_agonist']: # exp_opt_id, created = MutationOptional.objects.get_or_create(type=r['opt_type'], wt=r['opt_wt'], mu=r['opt_mu'], sign=r['opt_sign'], percentage=r['opt_percentage'], qual=r['opt_qual'], agonist=r['opt_agonist']) # else: # exp_opt_id = None try: mutation, created = Mutation.objects.get_or_create( amino_acid=r['mutation_to'], protein=protein, residue=res) except IntegrityError: mutation = Mutation.objects.get(amino_acid=r['mutation_to'], protein=protein, residue=res) logtypes = ['pEC50', 'pIC50', 'pK'] foldchange = 0 typefold = '' if r['exp_wt_value'] != 0 and r[ 'exp_mu_value_raw'] != 0: #fix for new format if re.match("(" + ")|(".join(logtypes) + ")", r['exp_type']): #-log values! try: foldchange = round( math.pow(10, -r['exp_mu_value_raw']) / pow(10, -r['exp_wt_value']), 3) except: print(r) typefold = r['exp_type'] + "_log" elif "%" == r['exp_wt_unit']: # if % then it's a difference case, then lower value is bad. Otherwise it's conc and lower is better foldchange = round( r['exp_wt_value'] / r['exp_mu_value_raw'], 3) else: foldchange = round( r['exp_mu_value_raw'] / r['exp_wt_value'], 3) typefold = r['exp_type'] + "_not_log" if foldchange > 0 and foldchange < 1 and foldchange != 0: foldchange = -round((1 / foldchange), 3) elif r['fold_effect'] != 0: foldchange = round(r['fold_effect'], 3) if foldchange < 1: foldchange = -round((1 / foldchange), 3) r['fold_effect'] = foldchange raw_experiment = self.insert_raw(r) # raw_experiment.save() bulk = MutationExperiment( refs=pub, review=pub_review, submitting_group=r['submitting_group'], data_container=r['data_container'], data_container_number=r['data_container_number'], protein=protein, residue=res, ligand=l, ligand_role=l_role, ligand_ref=l_ref, # raw = raw_experiment, #raw_experiment, OR None # optional = exp_opt_id, exp_type=exp_type_id, exp_func=exp_func_id, exp_qual=exp_qual_id, mutation=mutation, wt_value=r['exp_wt_value'], # wt_unit=r['exp_wt_unit'], mu_value=r['exp_mu_value_raw'], mu_sign=r['exp_mu_effect_sign'], foldchange=foldchange, opt_receptor_expression=r['opt_receptor_expression'], opt_basal_activity=r['opt_basal_activity'], opt_gain_of_activity=r['opt_gain_of_activity'], opt_ligand_emax=r['opt_ligand_emax'], opt_agonist=r['opt_agonist'], ) # for line,val in r.items(): # val = str(val) # if len(val)>100: # print(line,"too long",val) # mut_id = obj.id bulk_r.append(raw_experiment) bulk_m.append(bulk) # try: # bulk.save() # except Exception as e: # print(e) # print(r) # break #print('saved ',r['source_file']) inserted += 1 end = time.time() diff = round(end - current, 2) #print(diff) self.logger.info('Parsed ' + str(c) + ' mutant data entries. 
Skipped ' + str(skipped)) current = time.time() raws = MutationRaw.objects.bulk_create(bulk_r) for i, me in enumerate(bulk_m): me.raw = raws[i] MutationExperiment.objects.bulk_create(bulk_m) end = time.time() diff = round(end - current, 2) # current_sheet diff_2 = round(end - current_sheet, 2) print("overall", diff_2, "bulk", diff, len(bulk_m), "skipped", str(skipped)) sorted_missing_proteins = sorted(missing_proteins.items(), key=operator.itemgetter(1), reverse=True)
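This second main_func hands out rows to parallel workers through a shared counter (count) guarded by a lock, rather than pre-slicing self.data_all. A sketch of how such a worker loop could be driven is below; the run_workers helper, the process count and the positions argument are assumptions for illustration, and only the Value/Lock hand-off mirrors the loop above.

import multiprocessing

def run_workers(command, n_proc=4):
    # Shared row index and the lock that guards it while a worker claims a row
    count = multiprocessing.Value('i', 0)
    lock = multiprocessing.Lock()
    procs = []
    for i in range(n_proc):
        p = multiprocessing.Process(target=command.main_func,
                                    args=([0, None], i + 1, count, lock))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()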
def create_mutant_data(self, filenames): self.logger.info('CREATING MUTANT DATA') # what files should be parsed? if not filenames: filenames = os.listdir(self.structure_data_dir) missing_proteins = {} mutants_for_proteins = {} for source_file in filenames: source_file_path = os.sep.join([self.structure_data_dir, source_file]) if os.path.isfile(source_file_path) and source_file[0] != '.': self.logger.info('Reading file {}'.format(source_file_path)) # read the yaml file if source_file[-4:]=='xlsx' or source_file[-3:]=='xls': rows = self.loaddatafromexcel(source_file_path) rows = self.analyse_rows(rows) elif source_file[-4:]=='yaml': rows = yaml.load(open(source_file_path, 'r')) temp = [] for r in rows: d = {} d['reference'] = r['pubmed'] d['protein'] = r['entry_name'].replace("__","_").lower() d['mutation_pos'] = r['seq'] d['mutation_from'] = r['from_res'] d['mutation_to'] = r['to_res'] d['ligand_name'] = '' d['ligand_type'] = '' d['ligand_id'] = '' d['ligand_class'] = '' d['exp_type'] = '' d['exp_func'] = '' d['exp_wt_value'] = 0 d['exp_wt_unit'] = '' d['exp_mu_effect_sign'] = '' d['exp_mu_value_raw'] = 0 d['fold_effect'] = 0 d['exp_mu_effect_qual'] = '' d['exp_mu_effect_ligand_prop'] = '' d['exp_mu_ligand_ref'] = '' d['opt_type'] = '' d['opt_wt'] = 0 d['opt_mu'] = 0 d['opt_sign'] = '' d['opt_percentage'] = 0 d['opt_qual'] = '' d['opt_agonist'] = '' if len(d['mutation_to'])>1 or len(d['mutation_from'])>1: #if something is off with amino acid continue temp.append(d) rows = temp else: self.logger.info('unknown format'.source_file) continue c = 0 skipped = 0 inserted = 0 for r in rows: c += 1 if c%1000==0: self.logger.info('Parsed '+str(c)+' mutant data entries') # publication try: #fix if it thinks it's float. float(r['reference']) r['reference'] = str(int(r['reference'])) except ValueError: pass if r['reference'].isdigit(): #assume pubmed pub_type = 'pubmed' else: #assume doi pub_type = 'doi' try: pub = Publication.objects.get(web_link__index=r['reference'], web_link__web_resource__slug=pub_type) except Publication.DoesNotExist: pub = Publication() try: pub.web_link = WebLink.objects.get(index=r['reference'], web_resource__slug=pub_type) except WebLink.DoesNotExist: wl = WebLink.objects.create(index=r['reference'], web_resource = WebResource.objects.get(slug=pub_type)) pub.web_link = wl if pub_type == 'doi': pub.update_from_doi(doi=r['reference']) elif pub_type == 'pubmed': pub.update_from_pubmed_data(index=r['reference']) try: pub.save() except: self.logger.error('error with reference ' + str(r['reference']) + ' ' + pub_type) continue #if something off with publication, skip. 
if r['ligand_type']=='PubChem CID' or r['ligand_type']=='SMILES': if r['ligand_type']=='PubChem CID': pubchem_lookup_value = 'cid' elif r['ligand_type']=='SMILES': pubchem_lookup_value = 'smiles' try: web_resource = WebResource.objects.get(slug='pubchem') except: # abort if pdb resource is not found raise Exception('PubChem resource not found, aborting!') if 'ligand_name' in r and r['ligand_name']: ligand_name = str(r['ligand_name']) else: ligand_name = False try: # if this name is canonical and it has a ligand record already l = Ligand.objects.get(name=ligand_name, canonical=True, properities__web_links__web_resource=web_resource, properities__web_links__index=r['ligand_id']) except Ligand.DoesNotExist: try: # if exists under different name l_canonical = Ligand.objects.get(properities__web_links__web_resource=web_resource, properities__web_links__index=r['ligand_id'], canonical=True) l, created = Ligand.objects.get_or_create(properities = l_canonical.properities, name = ligand_name, canonical = False) if created: self.logger.info('Created ligand {}'.format(l.name)) except Ligand.DoesNotExist: # fetch ligand from pubchem default_ligand_type = 'Small molecule' lt, created = LigandType.objects.get_or_create(slug=slugify(default_ligand_type), defaults={'name': default_ligand_type}) l = Ligand() l = l.load_from_pubchem(pubchem_lookup_value, r['ligand_id'], lt, ligand_name) elif r['ligand_name']: # if this name is canonical and it has a ligand record already if Ligand.objects.filter(name=r['ligand_name'], canonical=True).exists(): l = Ligand.objects.get(name=r['ligand_name'], canonical=True) # if this matches an alias that only has "one" parent canonical name - eg distinct elif Ligand.objects.filter(name=r['ligand_name'], canonical=False, ambigious_alias=False).exists(): l = Ligand.objects.get(name=r['ligand_name'], canonical=False, ambigious_alias=False) # if this matches an alias that only has several canonical parents, must investigate, start # with empty. elif Ligand.objects.filter(name=r['ligand_name'], canonical=False, ambigious_alias=True).exists(): lp = LigandProperities() lp.save() l = Ligand() l.properities = lp l.name = r['ligand_name'] l.canonical = False l.ambigious_alias = True l.save() l.load_by_name(r['ligand_name']) # if neither a canonical or alias exists, create the records. Remember to check for # canonical / alias status. else: lp = LigandProperities() lp.save() l = Ligand() l.properities = lp l.name = str(r['ligand_name']) l.canonical = True l.ambigious_alias = False l.save() l.load_by_name(str(r['ligand_name'])) else: l = None if Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=True).exists(): #if this name is canonical and it has a ligand record already l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=True) elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=False).exists(): #if this matches an alias that only has "one" parent canonical name - eg distinct l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=False) elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=True).exists(): #if this matches an alias that only has several canonical parents, must investigate, start with empty. 
lp = LigandProperities() lp.save() l_ref = Ligand() l_ref.properities = lp l_ref.name = r['exp_mu_ligand_ref'] l_ref.canonical = False l_ref.ambigious_alias = True l_ref.save() l_ref.load_by_name(r['exp_mu_ligand_ref']) l_ref.save() elif r['exp_mu_ligand_ref']: #if neither a canonical or alias exists, create the records. Remember to check for canonical / alias status. lp = LigandProperities() lp.save() l_ref = Ligand() l_ref.properities = lp l_ref.name = r['exp_mu_ligand_ref'] l_ref.canonical = True l_ref.ambigious_alias = False l_ref.save() l_ref.load_by_name(r['exp_mu_ligand_ref']) l_ref.save() else: l_ref = None protein_id = 0 residue_id = 0 protein=Protein.objects.filter(entry_name=r['protein']) if protein.exists(): protein=protein.get() if r['protein'] in mutants_for_proteins: mutants_for_proteins[r['protein']] += 1 else: mutants_for_proteins[r['protein']] = 1 else: skipped += 1 if r['protein'] in missing_proteins: missing_proteins[r['protein']] += 1 else: missing_proteins[r['protein']] = 1 self.logger.error('Skipped due to no protein '+ r['protein']) continue res=Residue.objects.filter(protein_conformation__protein=protein,sequence_number=r['mutation_pos']) if res.exists(): res=res.get() else: self.logger.error('Skipped due to no residue ' + r['protein'] + ' pos:'+str(r['mutation_pos'])) skipped += 1 continue if r['ligand_class']: l_role, created = LigandRole.objects.get_or_create(name=r['ligand_class'], defaults={'slug': slugify(r['ligand_class'])[:50]}) # FIXME this should not be needed else: l_role = None if r['exp_type']: exp_type_id, created = MutationExperimentalType.objects.get_or_create(type=r['exp_type']) else: exp_type_id = None if r['exp_func']: exp_func_id, created = MutationFunc.objects.get_or_create(func=r['exp_func']) else: exp_func_id = None if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']: exp_qual_id, created = MutationQual.objects.get_or_create(qual=r['exp_mu_effect_qual'], prop=r['exp_mu_effect_ligand_prop']) else: exp_qual_id = None if r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r['opt_sign'] or r['opt_percentage'] or r['opt_qual'] or r['opt_agonist']: exp_opt_id, created = MutationOptional.objects.get_or_create(type=r['opt_type'], wt=r['opt_wt'], mu=r['opt_mu'], sign=r['opt_sign'], percentage=r['opt_percentage'], qual=r['opt_qual'], agonist=r['opt_agonist']) else: exp_opt_id = None mutation, created = Mutation.objects.get_or_create(amino_acid=r['mutation_to'],protein=protein, residue=res) logtypes = ['pEC50','pIC50','pK'] foldchange = 0 typefold = '' if r['exp_wt_value']!=0 and r['exp_mu_value_raw']!=0: #fix for new format if re.match("(" + ")|(".join(logtypes) + ")", r['exp_type']): #-log values! 
foldchange = round(math.pow(10,-r['exp_mu_value_raw'])/pow(10,-r['exp_wt_value']),3); typefold = r['exp_type']+"_log" else: foldchange = round(r['exp_mu_value_raw']/r['exp_wt_value'],3); typefold = r['exp_type']+"_not_log" if foldchange<1 and foldchange!=0: foldchange = -round((1/foldchange),3) elif r['fold_effect']!=0: foldchange = round(r['fold_effect'],3); if foldchange<1: foldchange = -round((1/foldchange),3); raw_experiment = self.insert_raw(r) obj, created = MutationExperiment.objects.get_or_create( refs=pub, protein=protein, residue=res, ligand=l, ligand_role=l_role, ligand_ref = l_ref, raw = raw_experiment, optional = exp_opt_id, exp_type=exp_type_id, exp_func=exp_func_id, exp_qual = exp_qual_id, mutation=mutation, wt_value=r['exp_wt_value'], # wt_unit=r['exp_wt_unit'], mu_value = r['exp_mu_value_raw'], mu_sign = r['exp_mu_effect_sign'], foldchange = foldchange ) mut_id = obj.id inserted += 1 self.logger.info('Parsed '+str(c)+' mutant data entries. Skipped '+str(skipped)) sorted_missing_proteins = sorted(missing_proteins.items(), key=operator.itemgetter(1),reverse=True) sorted_mutants_for_proteins = sorted(mutants_for_proteins.items(), key=operator.itemgetter(1),reverse=True) self.logger.info('COMPLETED CREATING MUTANTS')
def get_or_make_ligand(ligand_id,type_id, name = None): if type_id=='PubChem CID' or type_id=='SMILES': if type_id=='PubChem CID': pubchem_lookup_value = 'cid' elif type_id=='SMILES': pubchem_lookup_value = 'smiles' try: web_resource = WebResource.objects.get(slug='pubchem') except: # abort if pdb resource is not found raise Exception('PubChem resource not found, aborting!') if name: ligand_name = name else: ligand_name = False try: # if this name is canonical and it has a ligand record already if (ligand_name==False): l = None ls = Ligand.objects.filter(canonical=True, properities__web_links__web_resource=web_resource, properities__web_links__index=ligand_id) for ligand in ls: l = ligand #print (l) break if l == None: l = Ligand.objects.get(canonical=True, properities__web_links__web_resource=web_resource, properities__web_links__index=ligand_id) else: l = Ligand.objects.get(name=ligand_name, canonical=True, properities__web_links__web_resource=web_resource, properities__web_links__index=ligand_id) #l = Ligand.objects.get(name=ligand_name, canonical=True, # properities__web_links__web_resource=web_resource, # properities__web_links__index=ligand_id) # except Ligand.DoesNotExist: try: # if exists under different name l_canonical = Ligand.objects.get(properities__web_links__web_resource=web_resource, properities__web_links__index=ligand_id, canonical=True) #print (created) try: l, created = Ligand.objects.get_or_create(properities = l_canonical.properities, name = ligand_name, canonical = False) except IntegrityError: l = Ligand.objects.get(properities = l_canonical.properities, name = ligand_name, canonical = False) except Ligand.DoesNotExist: # fetch ligand from pubchem default_ligand_type = 'Small molecule' lt, created = LigandType.objects.get_or_create(slug=slugify(default_ligand_type), defaults={'name': default_ligand_type}) l = Ligand() #print (ligand_name) l = l.load_from_pubchem(pubchem_lookup_value, ligand_id, lt, ligand_name) #print (l) if l == None and type_id=='SMILES': #insert manually if smiles and unfound in pubchem try: l = Ligand.objects.get(name=ligand_name, canonical=True, properities__smiles=ligand_id) except Ligand.DoesNotExist: try: l = Ligand.objects.get(name__startswith=ligand_name, canonical=True,properities__smiles=ligand_id) #if no properities exist except Ligand.DoesNotExist: try: l = Ligand.objects.get(name=ligand_name, canonical=True,properities__smiles=None) #if no properities exist l.properities.smiles = ligand_id l.properities.save() l.save() except Ligand.DoesNotExist: ## now insert a new ligand, but first make sure name is unique if Ligand.objects.filter(name=ligand_name).exists(): ls = Ligand.objects.filter(name__startswith=ligand_name, canonical=True).order_by("pk") for l_temp in ls: last = l_temp.name.split("_")[-1] if last==ligand_name: #no addition yet ligand_name = ligand_name +"_1" else: ligand_name = ligand_name +"_"+str(int(last)+1) l = Ligand() l.name = ligand_name lp = LigandProperities() lp.smiles = ligand_id lp.ligand_type = lt lp.save() l.properities = lp l.canonical = True #maybe false, but that would break stuff. 
l.ambigious_alias = False try: l.save() except IntegrityError: l = Ligand.objects.get(name=ligand_name, canonical=True) elif name: # if this name is canonical and it has a ligand record already if Ligand.objects.filter(name=name, canonical=True).exists(): l = Ligand.objects.get(name=name, canonical=True) # if this matches an alias that only has "one" parent canonical name - eg distinct elif Ligand.objects.filter(name=name, canonical=False, ambigious_alias=False).exists(): l = Ligand.objects.get(name=name, canonical=False, ambigious_alias=False) # if this matches an alias that only has several canonical parents, must investigate, start # with empty. elif Ligand.objects.filter(name=name, canonical=False, ambigious_alias=True).exists(): lp = LigandProperities() lp.save() l = Ligand() l.properities = lp l.name = name l.canonical = False l.ambigious_alias = True l.save() l.load_by_name(name) # if neither a canonical or alias exists, create the records. Remember to check for # canonical / alias status. else: lp = LigandProperities() lp.save() l = Ligand() l.properities = lp l.name = str(name) l.canonical = True l.ambigious_alias = False try: l.save() l.load_by_name(str(name)) except IntegrityError: l = Ligand.objects.get(name=str(name), canonical=True) else: l = None return l
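Both copies of get_or_make_ligand resolve a ligand either through PubChem (by CID or SMILES) or purely by name, creating canonical and alias records as needed; the mutation loaders call it with the ligand id, type and name taken from each row. A usage sketch follows, with placeholder literal values.

# Placeholders for illustration; the loaders pass r['ligand_id'], r['ligand_type'], str(r['ligand_name']).
lig = get_or_make_ligand('2244', 'PubChem CID', 'aspirin')              # resolve or create via PubChem CID
lig = get_or_make_ligand('CC(=O)Oc1ccccc1C(=O)O', 'SMILES', 'aspirin')  # resolve or create via SMILES
lig = get_or_make_ligand(None, '', 'aspirin')                           # name-only lookup / creation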
def main_func(self, positions, iteration, count, lock):
    # print(positions,iteration,count,lock)
    ligands = self.ligand_dump
    while count.value < len(ligands):
        with lock:
            l = ligands[count.value]
            count.value += 1
            if count.value % 10000 == 0:
                print('{} Status {} out of {}'.format(
                    datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S'),
                    count.value, len(ligands)))

        if 'logp' not in l:
            # temp skip to only use "full" annotated ligands
            continue

        lp = LigandProperities.objects.filter(inchikey=l['inchikey']).first()
        ligand = None
        if lp:
            # Check if inchikey is there
            ligand = Ligand.objects.filter(
                name=l['name'], properities=lp).prefetch_related(
                    'properities__ligand_type',
                    'properities__web_links',
                    'properities__vendors').first()
            # The name with corresponding inchikey is there, assume all is good and skip.
            # Will add links to make sure they're there.

        if not ligand:
            if lp:
                print(l['name'], 'is there! (but not by name, only inchi')
                ligand = Ligand()
                ligand.properities = lp
                ligand.name = l['name']
                ligand.canonical = l['canonical']
                ligand.ambigious_alias = l['ambigious_alias']
                ligand.save()
            else:
                # No ligand seems to match by inchikey -- start creating it.
                # Make LigandProperities first
                lt, created = LigandType.objects.get_or_create(
                    slug=l['ligand_type__slug'],
                    defaults={'name': l['ligand_type__name']})
                lp = LigandProperities()
                lp.inchikey = l['inchikey']
                lp.smiles = l['smiles']
                lp.mw = l['mw']
                lp.logp = l['logp']
                lp.rotatable_bonds = l['rotatable_bonds']
                lp.hacc = l['hacc']
                lp.hdon = l['hdon']
                lp.ligand_type = lt
                lp.save()
                ligand = Ligand()
                ligand.properities = lp
                ligand.name = l['name']
                ligand.canonical = l['canonical']
                ligand.ambigious_alias = l['ambigious_alias']
                ligand.save()

        # create links - impossible to make duplicates so no need to check if there already
        if ligand.properities.web_links.count() < len(l['web_links']):
            for link in l['web_links']:
                wr = WebResource.objects.get(slug=link['web_resource'])
                wl, created = WebLink.objects.get_or_create(index=link['index'], web_resource=wr)
                ligand.properities.web_links.add(wl)

        # create vendors - impossible to make duplicates so no need to check if there already
        if ligand.properities.vendors.count() < len(l['vendors']):
            for link in l['vendors']:
                lv = LigandVendors.objects.get(slug=link['vendor_slug'])
                check = LigandVendorLink.objects.filter(sid=link['sid']).exists()
                if not check:
                    lvl = LigandVendorLink()
                    lvl.sid = link['sid']
                    lvl.vendor = lv
                    lvl.lp = ligand.properities
                    lvl.vendor_external_id = link['vendor_external_id']
                    lvl.url = link['url']
                    lvl.save()
def main_func(self, positions, iteration): # filenames if not positions[1]: filenames = self.filenames[positions[0]:] else: filenames = self.filenames[positions[0]:positions[1]] for source_file in filenames: source_file_path = os.sep.join([self.structure_data_dir, source_file]) if os.path.isfile(source_file_path) and source_file[0] != '.': self.logger.info('Reading file {}'.format(source_file_path)) # read the yaml file with open(source_file_path, 'r') as f: sd = yaml.load(f) # is this a representative structure (will be used to guide structure-based alignments)? representative = False if 'representative' in sd and sd['representative']: representative = True # only process representative structures on first iteration if not representative and iteration == 1: continue # skip representative structures on second iteration if representative and iteration == 2: continue # is there a construct? if 'construct' not in sd: self.logger.error('No construct specified, skipping!') continue # does the construct exists? try: con = Protein.objects.get(entry_name=sd['construct']) except Protein.DoesNotExist: self.logger.error('Construct {} does not exists, skipping!'.format(sd['construct'])) continue # create a structure record try: s = Structure.objects.get(protein_conformation__protein=con) except Structure.DoesNotExist: s = Structure() s.representative = representative # protein state if 'state' not in sd: self.logger.warning('State not defined, using default state {}'.format( settings.DEFAULT_PROTEIN_STATE)) state = settings.DEFAULT_STATE.title() else: state = sd['state'] state_slug = slugify(state) try: ps, created = ProteinState.objects.get_or_create(slug=state_slug, defaults={'name': state}) if created: self.logger.info('Created protein state {}'.format(ps.name)) except IntegrityError: ps = ProteinState.objects.get(slug=state_slug) s.state = ps # protein conformation try: s.protein_conformation = ProteinConformation.objects.get(protein=con) except ProteinConformation.DoesNotExist: self.logger.error('Protein conformation for construct {} does not exists'.format(con)) continue if s.protein_conformation.state is not state: ProteinConformation.objects.filter(protein=con).update(state=ps) # get the PDB file and save to DB sd['pdb'] = sd['pdb'].upper() if not os.path.exists(self.pdb_data_dir): os.makedirs(self.pdb_data_dir) pdb_path = os.sep.join([self.pdb_data_dir, sd['pdb'] + '.pdb']) if not os.path.isfile(pdb_path): self.logger.info('Fetching PDB file {}'.format(sd['pdb'])) url = 'http://www.rcsb.org/pdb/files/%s.pdb' % sd['pdb'] pdbdata_raw = urlopen(url).read().decode('utf-8') with open(pdb_path, 'w') as f: f.write(pdbdata_raw) else: with open(pdb_path, 'r') as pdb_file: pdbdata_raw = pdb_file.read() pdbdata, created = PdbData.objects.get_or_create(pdb=pdbdata_raw) s.pdb_data = pdbdata # UPDATE HETSYN with its PDB reference instead + GRAB PUB DATE, PMID, DOI AND RESOLUTION hetsyn = {} hetsyn_reverse = {} for line in pdbdata_raw.splitlines(): if line.startswith('HETSYN'): m = re.match("HETSYN[\s]+([\w]{3})[\s]+(.+)",line) ### need to fix bad PDB formatting where col4 and col5 are put together for some reason -- usually seen when the id is +1000 if (m): hetsyn[m.group(2).strip()] = m.group(1).upper() hetsyn_reverse[m.group(1)] = m.group(2).strip().upper() if line.startswith('HETNAM'): m = re.match("HETNAM[\s]+([\w]{3})[\s]+(.+)",line) ### need to fix bad PDB formatting where col4 and col5 are put together for some reason -- usually seen when the id is +1000 if (m): hetsyn[m.group(2).strip()] = 
m.group(1).upper() hetsyn_reverse[m.group(1)] = m.group(2).strip().upper() if line.startswith('REVDAT 1'): sd['publication_date'] = line[13:22] if line.startswith('JRNL PMID'): sd['pubmed_id'] = line[19:].strip() if line.startswith('JRNL DOI'): sd['doi_id'] = line[19:].strip() if len(hetsyn) == 0: self.logger.info("PDB file contained NO hetsyn") with open(pdb_path,'r') as header: header_dict = parse_pdb_header(header) sd['publication_date'] = header_dict['release_date'] sd['resolution'] = str(header_dict['resolution']).strip() sd['structure_method'] = header_dict['structure_method'] # structure type if 'structure_method' in sd and sd['structure_method']: structure_type = sd['structure_method'].capitalize() structure_type_slug = slugify(sd['structure_method']) try: st, created = StructureType.objects.get_or_create(slug=structure_type_slug, defaults={'name': structure_type}) if created: self.logger.info('Created structure type {}'.format(st)) except IntegrityError: st = StructureType.objects.get(slug=structure_type_slug) s.structure_type = st else: self.logger.warning('No structure type specified in PDB file {}'.format(sd['pdb'])) matched = 0 if 'ligand' in sd and sd['ligand']: if isinstance(sd['ligand'], list): ligands = sd['ligand'] else: ligands = [sd['ligand']] for ligand in ligands: if 'name' in ligand: if ligand['name'].upper() in hetsyn: self.logger.info('Ligand {} matched to PDB records'.format(ligand['name'])) matched = 1 ligand['name'] = hetsyn[ligand['name'].upper()] elif ligand['name'].upper() in hetsyn_reverse: matched = 1 if matched==0 and len(hetsyn)>0: self.logger.info('No ligand names found in HET in structure {}'.format(sd['pdb'])) # REMOVE? can be used to dump structure files with updated ligands # yaml.dump(sd, open(source_file_path, 'w'), indent=4) # pdb code if 'pdb' in sd: try: web_resource = WebResource.objects.get(slug='pdb') except: # abort if pdb resource is not found raise Exception('PDB resource not found, aborting!') s.pdb_code, created = WebLink.objects.get_or_create(index=sd['pdb'], web_resource=web_resource) else: self.logger.error('PDB code not specified for structure {}, skipping!'.format(sd['pdb'])) continue # insert into plain text fields if 'preferred_chain' in sd: s.preferred_chain = sd['preferred_chain'] else: self.logger.warning('Preferred chain not specified for structure {}'.format(sd['pdb'])) if 'resolution' in sd: s.resolution = float(sd['resolution']) else: self.logger.warning('Resolution not specified for structure {}'.format(sd['pdb'])) if 'publication_date' in sd: s.publication_date = sd['publication_date'] else: self.logger.warning('Publication date not specified for structure {}'.format(sd['pdb'])) # publication try: if 'doi_id' in sd: try: s.publication = Publication.objects.get(web_link__index=sd['doi_id']) except Publication.DoesNotExist as e: p = Publication() try: p.web_link = WebLink.objects.get(index=sd['doi_id'], web_resource__slug='doi') except WebLink.DoesNotExist: wl = WebLink.objects.create(index=sd['doi_id'], web_resource = WebResource.objects.get(slug='doi')) p.web_link = wl p.update_from_doi(doi=sd['doi_id']) p.save() s.publication = p elif 'pubmed_id' in sd: try: s.publication = Publication.objects.get(web_link__index=sd['pubmed_id']) except Publication.DoesNotExist as e: p = Publication() try: p.web_link = WebLink.objects.get(index=sd['pubmed_id'], web_resource__slug='pubmed') except WebLink.DoesNotExist: wl = WebLink.objects.create(index=sd['pubmed_id'], web_resource = WebResource.objects.get(slug='pubmed')) p.web_link = wl 
p.update_from_pubmed_data(index=sd['pubmed_id']) p.save() s.publication = p except: self.logger.error('Error saving publication'.format(ps.name)) # save structure before adding M2M relations s.save() #Delete previous interaction data to prevent errors. ResidueFragmentInteraction.objects.filter(structure_ligand_pair__structure=s).delete() StructureLigandInteraction.objects.filter(structure=s).delete() #Remove previous Rotamers/Residues to prepare repopulate Fragment.objects.filter(structure=s).delete() Rotamer.objects.filter(structure=s).all().delete() Residue.objects.filter(protein_conformation=s.protein_conformation).all().delete() # endogenous ligand(s) default_ligand_type = 'Small molecule' if representative and 'endogenous_ligand' in sd and sd['endogenous_ligand']: if isinstance(sd['endogenous_ligand'], list): endogenous_ligands = sd['endogenous_ligand'] else: endogenous_ligands = [sd['endogenous_ligand']] for endogenous_ligand in endogenous_ligands: if endogenous_ligand['type']: lt, created = LigandType.objects.get_or_create(slug=slugify(endogenous_ligand['type']), defaults={'name': endogenous_ligand['type']}) else: lt, created = LigandType.objects.get_or_create(slug=slugify(default_ligand_type), defaults={'name': default_ligand_type}) ligand = Ligand() if 'iupharId' not in endogenous_ligand: endogenous_ligand['iupharId'] = 0 ligand = ligand.load_by_gtop_id(endogenous_ligand['name'], endogenous_ligand['iupharId'], lt) try: s.protein_conformation.protein.parent.endogenous_ligands.add(ligand) except IntegrityError: self.logger.info('Endogenous ligand for protein {}, already added. Skipping.'.format( s.protein_conformation.protein.parent)) # ligands if 'ligand' in sd and sd['ligand']: if isinstance(sd['ligand'], list): ligands = sd['ligand'] else: ligands = [sd['ligand']] for ligand in ligands: l = False peptide_chain = "" if 'chain' in ligand: peptide_chain = ligand['chain'] ligand['name'] = 'pep' if ligand['name'] and ligand['name'] != 'None': # some inserted as none. 
# use annoted ligand type or default type if ligand['type']: lt, created = LigandType.objects.get_or_create(slug=slugify(ligand['type']), defaults={'name': ligand['type']}) else: lt, created = LigandType.objects.get_or_create( slug=slugify(default_ligand_type), defaults={'name': default_ligand_type}) # set pdb reference for structure-ligand interaction pdb_reference = ligand['name'] # use pubchem_id if 'pubchemId' in ligand and ligand['pubchemId'] and ligand['pubchemId'] != 'None': # create ligand l = Ligand() # update ligand by pubchem id ligand_title = False if 'title' in ligand and ligand['title']: ligand_title = ligand['title'] l = l.load_from_pubchem('cid', ligand['pubchemId'], lt, ligand_title) # if no pubchem id is specified, use name else: # use ligand title, if specified if 'title' in ligand and ligand['title']: ligand['name'] = ligand['title'] # create empty properties lp = LigandProperities.objects.create() # create the ligand try: l, created = Ligand.objects.get_or_create(name=ligand['name'], canonical=True, defaults={'properities': lp, 'ambigious_alias': False}) if created: self.logger.info('Created ligand {}'.format(ligand['name'])) else: pass except IntegrityError: l = Ligand.objects.get(name=ligand['name'], canonical=True) # save ligand l.save() else: continue # structure-ligand interaction if l and ligand['role']: role_slug = slugify(ligand['role']) try: lr, created = LigandRole.objects.get_or_create(slug=role_slug, defaults={'name': ligand['role']}) if created: self.logger.info('Created ligand role {}'.format(ligand['role'])) except IntegrityError: lr = LigandRole.objects.get(slug=role_slug) i, created = StructureLigandInteraction.objects.get_or_create(structure=s, ligand=l, ligand_role=lr, annotated=True, defaults={'pdb_reference': pdb_reference}) if i.pdb_reference != pdb_reference: i.pdb_reference = pdb_reference i.save() # structure segments if 'segments' in sd and sd['segments']: for segment, positions in sd['segments'].items(): # fetch (create if needed) sequence segment try: protein_segment = ProteinSegment.objects.get(slug=segment) except ProteinSegment.DoesNotExist: self.logger.error('Segment {} not found'.format(segment)) continue struct_seg, created = StructureSegment.objects.update_or_create(structure=s, protein_segment=protein_segment, defaults={'start': positions[0], 'end': positions[1]}) # all representive structures should have defined segments elif representative: self.logger.warning('Segments not defined for representative structure {}'.format(sd['pdb'])) # structure segments for modeling if 'segments_in_structure' in sd and sd['segments_in_structure']: for segment, positions in sd['segments_in_structure'].items(): # fetch (create if needed) sequence segment try: protein_segment = ProteinSegment.objects.get(slug=segment) except ProteinSegment.DoesNotExist: self.logger.error('Segment {} not found'.format(segment)) continue struct_seg_mod, created = StructureSegmentModeling.objects.update_or_create(structure=s, protein_segment=protein_segment, defaults={'start': positions[0], 'end': positions[1]}) # structure coordinates if 'coordinates' in sd and sd['coordinates']: for segment, coordinates in sd['coordinates'].items(): # fetch (create if needed) sequence segment try: protein_segment = ProteinSegment.objects.get(slug=segment) except ProteinSegment.DoesNotExist: self.logger.error('Segment {} not found'.format(segment)) continue # fetch (create if needed) coordinates description try: description, created = 
StructureCoordinatesDescription.objects.get_or_create( text=coordinates) if created: self.logger.info('Created structure coordinate description {}'.format(coordinates)) except IntegrityError: description = StructureCoordinatesDescription.objects.get(text=coordinates) sc = StructureCoordinates() sc.structure = s sc.protein_segment = protein_segment sc.description = description sc.save() # structure engineering if 'engineering' in sd and sd['engineering']: for segment, engineering in sd['engineering'].items(): # fetch (create if needed) sequence segment try: protein_segment = ProteinSegment.objects.get(slug=segment) except ProteinSegment.DoesNotExist: self.logger.error('Segment {} not found'.format(segment)) continue # fetch (create if needed) engineering description try: description, created = StructureEngineeringDescription.objects.get_or_create( text=engineering) if created: self.logger.info('Created structure coordinate description {}'.format(engineering)) except IntegrityError: description = StructureEngineeringDescription.objects.get(text=engineering) se = StructureEngineering() se.structure = s se.protein_segment = protein_segment se.description = description se.save() # protein anomalies scheme = s.protein_conformation.protein.residue_numbering_scheme if 'bulges' in sd and sd['bulges']: pa_slug = 'bulge' try: pab, created = ProteinAnomalyType.objects.get_or_create(slug=pa_slug, defaults={ 'name': 'Bulge'}) if created: self.logger.info('Created protein anomaly type {}'.format(pab)) except IntegrityError: pab = ProteinAnomalyType.objects.get(slug=pa_slug) for segment, bulges in sd['bulges'].items(): for bulge in bulges: try: gn, created = ResidueGenericNumber.objects.get_or_create(label=bulge, scheme=scheme, defaults={'protein_segment': ProteinSegment.objects.get( slug=segment)}) if created: self.logger.info('Created generic number {}'.format(gn)) except IntegrityError: gn = ResidueGenericNumber.objects.get(label=bulge, scheme=scheme) try: pa, created = ProteinAnomaly.objects.get_or_create(anomaly_type=pab, generic_number=gn) if created: self.logger.info('Created protein anomaly {}'.format(pa)) except IntegrityError: pa, created = ProteinAnomaly.objects.get(anomaly_type=pab, generic_number=gn) s.protein_anomalies.add(pa) if 'constrictions' in sd and sd['constrictions']: pa_slug = 'constriction' try: pac, created = ProteinAnomalyType.objects.get_or_create(slug=pa_slug, defaults={ 'name': 'Constriction'}) if created: self.logger.info('Created protein anomaly type {}'.format(pac)) except IntegrityError: pac = ProteinAnomalyType.objects.get(slug=pa_slug) for segment, constrictions in sd['constrictions'].items(): for constriction in constrictions: try: gn, created = ResidueGenericNumber.objects.get_or_create(label=constriction, scheme=scheme, defaults={'protein_segment': ProteinSegment.objects.get( slug=segment)}) if created: self.logger.info('Created generic number {}'.format(gn)) except IntegrityError: gn = ResidueGenericNumber.objects.get(label=constriction, scheme=scheme) try: pa, created = ProteinAnomaly.objects.get_or_create(anomaly_type=pac, generic_number=gn) if created: self.logger.info('Created protein anomaly {}'.format(pa)) except IntegrityError: pa, created = ProteinAnomaly.objects.get(anomaly_type=pac, generic_number=gn) s.protein_anomalies.add(pa) # stabilizing agents, FIXME - redesign this! # fusion proteins moved to constructs, use this for G-proteins and other agents? 
aux_proteins = [] if 'signaling_protein' in sd and sd['signaling_protein'] and sd['signaling_protein'] != 'None': aux_proteins.append('signaling_protein') if 'auxiliary_protein' in sd and sd['auxiliary_protein'] and sd['auxiliary_protein'] != 'None': aux_proteins.append('auxiliary_protein') for index in aux_proteins: if isinstance(sd[index], list): aps = sd[index] else: aps = [sd[index]] for aux_protein in aps: aux_protein_slug = slugify(aux_protein)[:50] try: sa, created = StructureStabilizingAgent.objects.get_or_create( slug=aux_protein_slug, defaults={'name': aux_protein}) except IntegrityError: sa = StructureStabilizingAgent.objects.get(slug=aux_protein_slug) s.stabilizing_agents.add(sa) # save structure s.save() self.logger.info('Calculate rotamers / residues') self.create_rotamers(s,pdb_path) self.logger.info('Calculate interactions') #Should not error anymore. If it does, fix. runcalculation(sd['pdb'],peptide_chain) parsecalculation(sd['pdb'],False)
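# --- Illustrative sketch (standalone, not part of the loader class) ---------------------
# The loader above wraps most get_or_create calls in an IntegrityError fallback so that a
# second build worker hitting the same unique slug fetches the existing row instead of
# crashing. A minimal sketch of that pattern, assuming a configured Django project; the
# helper name is hypothetical and the model import path is an assumption for this project.
from django.db import IntegrityError
from django.utils.text import slugify
from structure.models import StructureStabilizingAgent  # assumed app path

def get_or_create_stabilizing_agent(name):
    # slugs are truncated to 50 characters, matching the loader above
    slug = slugify(name)[:50]
    try:
        agent, _ = StructureStabilizingAgent.objects.get_or_create(slug=slug, defaults={'name': name})
    except IntegrityError:
        # another worker inserted the row between the lookup and the insert
        agent = StructureStabilizingAgent.objects.get(slug=slug)
    return agent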
def create_mutant_data(self, filenames): self.logger.info('CREATING MUTANT DATA') # what files should be parsed? if not filenames: filenames = os.listdir(self.structure_data_dir) missing_proteins = {} mutants_for_proteins = {} for source_file in filenames: source_file_path = os.sep.join( [self.structure_data_dir, source_file]) if os.path.isfile(source_file_path) and source_file[0] != '.': self.logger.info('Reading file {}'.format(source_file_path)) # read the yaml file if source_file[-4:] == 'xlsx' or source_file[-3:] == 'xls': rows = self.loaddatafromexcel(source_file_path) rows = self.analyse_rows(rows) elif source_file[-4:] == 'yaml': rows = yaml.load(open(source_file_path, 'r')) temp = [] for r in rows: d = {} d['reference'] = r['pubmed'] d['protein'] = r['entry_name'].replace("__", "_").lower() d['mutation_pos'] = r['seq'] d['mutation_from'] = r['from_res'] d['mutation_to'] = r['to_res'] d['ligand_name'] = '' d['ligand_type'] = '' d['ligand_id'] = '' d['ligand_class'] = '' d['exp_type'] = '' d['exp_func'] = '' d['exp_wt_value'] = 0 d['exp_wt_unit'] = '' d['exp_mu_effect_sign'] = '' d['exp_mu_value_raw'] = 0 d['fold_effect'] = 0 d['exp_mu_effect_qual'] = '' d['exp_mu_effect_ligand_prop'] = '' d['exp_mu_ligand_ref'] = '' d['opt_type'] = '' d['opt_wt'] = 0 d['opt_mu'] = 0 d['opt_sign'] = '' d['opt_percentage'] = 0 d['opt_qual'] = '' d['opt_agonist'] = '' if len(d['mutation_to']) > 1 or len(d['mutation_from']) > 1: #if something is off with amino acid continue temp.append(d) rows = temp else: self.logger.info('unknown format {}'.format(source_file)) continue c = 0 skipped = 0 inserted = 0 for r in rows: c += 1 if c % 1000 == 0: self.logger.info('Parsed ' + str(c) + ' mutant data entries') # publication try: #fix if it thinks it's float. float(r['reference']) r['reference'] = str(int(r['reference'])) except ValueError: pass if r['reference'].isdigit(): #assume pubmed pub_type = 'pubmed' else: #assume doi pub_type = 'doi' try: pub = Publication.objects.get( web_link__index=r['reference'], web_link__web_resource__slug=pub_type) except Publication.DoesNotExist: pub = Publication() try: pub.web_link = WebLink.objects.get( index=r['reference'], web_resource__slug=pub_type) except WebLink.DoesNotExist: wl = WebLink.objects.create( index=r['reference'], web_resource=WebResource.objects.get( slug=pub_type)) pub.web_link = wl if pub_type == 'doi': pub.update_from_doi(doi=r['reference']) elif pub_type == 'pubmed': pub.update_from_pubmed_data(index=r['reference']) try: pub.save() except: self.logger.error('error with reference ' + str(r['reference']) + ' ' + pub_type) continue #if something off with publication, skip. 
if r['ligand_type'] == 'PubChem CID' or r[ 'ligand_type'] == 'SMILES': if r['ligand_type'] == 'PubChem CID': pubchem_lookup_value = 'cid' elif r['ligand_type'] == 'SMILES': pubchem_lookup_value = 'smiles' try: web_resource = WebResource.objects.get( slug='pubchem') except: # abort if pdb resource is not found raise Exception( 'PubChem resource not found, aborting!') if 'ligand_name' in r and r['ligand_name']: ligand_name = str(r['ligand_name']) else: ligand_name = False try: # if this name is canonical and it has a ligand record already l = Ligand.objects.get( name=ligand_name, canonical=True, properities__web_links__web_resource= web_resource, properities__web_links__index=r['ligand_id']) except Ligand.DoesNotExist: try: # if exists under different name l_canonical = Ligand.objects.get( properities__web_links__web_resource= web_resource, properities__web_links__index=r[ 'ligand_id'], canonical=True) l, created = Ligand.objects.get_or_create( properities=l_canonical.properities, name=ligand_name, canonical=False) if created: self.logger.info( 'Created ligand {}'.format(l.name)) except Ligand.DoesNotExist: # fetch ligand from pubchem default_ligand_type = 'Small molecule' lt, created = LigandType.objects.get_or_create( slug=slugify(default_ligand_type), defaults={'name': default_ligand_type}) l = Ligand() l = l.load_from_pubchem( pubchem_lookup_value, r['ligand_id'], lt, ligand_name) elif r['ligand_name']: # if this name is canonical and it has a ligand record already if Ligand.objects.filter(name=r['ligand_name'], canonical=True).exists(): l = Ligand.objects.get(name=r['ligand_name'], canonical=True) # if this matches an alias that only has "one" parent canonical name - eg distinct elif Ligand.objects.filter( name=r['ligand_name'], canonical=False, ambigious_alias=False).exists(): l = Ligand.objects.get(name=r['ligand_name'], canonical=False, ambigious_alias=False) # if this matches an alias that only has several canonical parents, must investigate, start # with empty. elif Ligand.objects.filter( name=r['ligand_name'], canonical=False, ambigious_alias=True).exists(): lp = LigandProperities() lp.save() l = Ligand() l.properities = lp l.name = r['ligand_name'] l.canonical = False l.ambigious_alias = True l.save() l.load_by_name(r['ligand_name']) # if neither a canonical or alias exists, create the records. Remember to check for # canonical / alias status. else: lp = LigandProperities() lp.save() l = Ligand() l.properities = lp l.name = str(r['ligand_name']) l.canonical = True l.ambigious_alias = False l.save() l.load_by_name(str(r['ligand_name'])) else: l = None if Ligand.objects.filter( name=r['exp_mu_ligand_ref'], canonical=True ).exists( ): #if this name is canonical and it has a ligand record already l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=True) elif Ligand.objects.filter( name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=False ).exists( ): #if this matches an alias that only has "one" parent canonical name - eg distinct l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=False) elif Ligand.objects.filter( name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=True ).exists( ): #if this matches an alias that only has several canonical parents, must investigate, start with empty. 
lp = LigandProperities() lp.save() l_ref = Ligand() l_ref.properities = lp l_ref.name = r['exp_mu_ligand_ref'] l_ref.canonical = False l_ref.ambigious_alias = True l_ref.save() l_ref.load_by_name(r['exp_mu_ligand_ref']) l_ref.save() elif r['exp_mu_ligand_ref']: #if neither a canonical or alias exists, create the records. Remember to check for canonical / alias status. lp = LigandProperities() lp.save() l_ref = Ligand() l_ref.properities = lp l_ref.name = r['exp_mu_ligand_ref'] l_ref.canonical = True l_ref.ambigious_alias = False l_ref.save() l_ref.load_by_name(r['exp_mu_ligand_ref']) l_ref.save() else: l_ref = None protein_id = 0 residue_id = 0 protein = Protein.objects.filter(entry_name=r['protein']) if protein.exists(): protein = protein.get() if r['protein'] in mutants_for_proteins: mutants_for_proteins[r['protein']] += 1 else: mutants_for_proteins[r['protein']] = 1 else: skipped += 1 if r['protein'] in missing_proteins: missing_proteins[r['protein']] += 1 else: missing_proteins[r['protein']] = 1 self.logger.error('Skipped due to no protein ' + r['protein']) continue res = Residue.objects.filter( protein_conformation__protein=protein, sequence_number=r['mutation_pos']) if res.exists(): res = res.get() else: self.logger.error('Skipped due to no residue ' + r['protein'] + ' pos:' + str(r['mutation_pos'])) skipped += 1 continue if r['ligand_class']: l_role, created = LigandRole.objects.get_or_create( name=r['ligand_class'], defaults={'slug': slugify(r['ligand_class'])[:50] }) # FIXME this should not be needed else: l_role = None if r['exp_type']: exp_type_id, created = MutationExperimentalType.objects.get_or_create( type=r['exp_type']) else: exp_type_id = None if r['exp_func']: exp_func_id, created = MutationFunc.objects.get_or_create( func=r['exp_func']) else: exp_func_id = None if r['exp_mu_effect_ligand_prop'] or r[ 'exp_mu_effect_qual']: exp_qual_id, created = MutationQual.objects.get_or_create( qual=r['exp_mu_effect_qual'], prop=r['exp_mu_effect_ligand_prop']) else: exp_qual_id = None if r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r[ 'opt_sign'] or r['opt_percentage'] or r[ 'opt_qual'] or r['opt_agonist']: exp_opt_id, created = MutationOptional.objects.get_or_create( type=r['opt_type'], wt=r['opt_wt'], mu=r['opt_mu'], sign=r['opt_sign'], percentage=r['opt_percentage'], qual=r['opt_qual'], agonist=r['opt_agonist']) else: exp_opt_id = None mutation, created = Mutation.objects.get_or_create( amino_acid=r['mutation_to'], protein=protein, residue=res) logtypes = ['pEC50', 'pIC50', 'pK'] foldchange = 0 typefold = '' if r['exp_wt_value'] != 0 and r[ 'exp_mu_value_raw'] != 0: #fix for new format if re.match("(" + ")|(".join(logtypes) + ")", r['exp_type']): #-log values! 
foldchange = round( math.pow(10, -r['exp_mu_value_raw']) / pow(10, -r['exp_wt_value']), 3) typefold = r['exp_type'] + "_log" else: foldchange = round( r['exp_mu_value_raw'] / r['exp_wt_value'], 3) typefold = r['exp_type'] + "_not_log" if foldchange < 1 and foldchange != 0: foldchange = -round((1 / foldchange), 3) elif r['fold_effect'] != 0: foldchange = round(r['fold_effect'], 3) if foldchange < 1: foldchange = -round((1 / foldchange), 3) raw_experiment = self.insert_raw(r) obj, created = MutationExperiment.objects.get_or_create( refs=pub, protein=protein, residue=res, ligand=l, ligand_role=l_role, ligand_ref=l_ref, raw=raw_experiment, optional=exp_opt_id, exp_type=exp_type_id, exp_func=exp_func_id, exp_qual=exp_qual_id, mutation=mutation, wt_value=r['exp_wt_value'], # wt_unit=r['exp_wt_unit'], mu_value=r['exp_mu_value_raw'], mu_sign=r['exp_mu_effect_sign'], foldchange=foldchange) mut_id = obj.id inserted += 1 self.logger.info('Parsed ' + str(c) + ' mutant data entries. Skipped ' + str(skipped)) sorted_missing_proteins = sorted(missing_proteins.items(), key=operator.itemgetter(1), reverse=True) sorted_mutants_for_proteins = sorted(mutants_for_proteins.items(), key=operator.itemgetter(1), reverse=True) self.logger.info('COMPLETED CREATING MUTANTS')
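# --- Illustrative sketch (standalone, not part of the loader class) ---------------------
# The fold-change logic above converts -log assay values (pEC50, pIC50, pK*) back to
# linear space before comparing mutant to wild type, and flips ratios below 1 to a
# negative fold so losses and gains are reported symmetrically. A self-contained sketch of
# that arithmetic; the function name is illustrative (the loader does this inline) and the
# separate fold_effect fallback branch is left out.
import math
import re

LOG_TYPES = ['pEC50', 'pIC50', 'pK']

def fold_change(exp_type, wt_value, mu_value):
    if wt_value == 0 or mu_value == 0:
        return 0
    if re.match("(" + ")|(".join(LOG_TYPES) + ")", exp_type):
        # -log values: convert back to linear concentrations first
        fold = round(math.pow(10, -mu_value) / math.pow(10, -wt_value), 3)
    else:
        fold = round(mu_value / wt_value, 3)
    if 0 < fold < 1:
        fold = -round(1 / fold, 3)
    return fold

# a two-unit pEC50 shift is a 100-fold change, signed by its direction
assert fold_change('pEC50', 8.0, 6.0) == 100.0
assert fold_change('pEC50', 6.0, 8.0) == -100.0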
def main_func(self, positions, iteration,count,lock): # print(positions,iteration,count,lock) ligands = self.ligand_dump while count.value<len(ligands): with lock: l = ligands[count.value] count.value +=1 if count.value % 10000 == 0: print('{} Status {} out of {}'.format( datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S'), count.value, len(ligands))) if 'logp' not in l: # temp skip to only use "full" annotated ligands continue lp = LigandProperities.objects.filter(inchikey=l['inchikey']).first() ligand = None if lp: # Check if inchikey is there ligand = Ligand.objects.filter(name=l['name'], properities=lp).prefetch_related('properities__ligand_type','properities__web_links','properities__vendors').first() # The name with corresponding inchikey is there, assume all is good and skip. # Will add links to make sure they're there. if not ligand: if lp: print(l['name'],'is there! (but not by name, only inchi') ligand = Ligand() ligand.properities = lp ligand.name = l['name'] ligand.canonical = l['canonical'] ligand.ambigious_alias = l['ambigious_alias'] ligand.save() else: # No ligand seems to match by inchikey -- start creating it. # Make LigandProperities first lt, created = LigandType.objects.get_or_create(slug=l['ligand_type__slug'],defaults = {'name':l['ligand_type__name']}) lp = LigandProperities() lp.inchikey = l['inchikey'] lp.smiles = l['smiles'] lp.mw = l['mw'] lp.logp = l['logp'] lp.rotatable_bonds = l['rotatable_bonds'] lp.hacc = l['hacc'] lp.hdon = l['hdon'] lp.ligand_type = lt lp.save() ligand = Ligand() ligand.properities = lp ligand.name = l['name'] ligand.canonical = l['canonical'] ligand.ambigious_alias = l['ambigious_alias'] ligand.save() # create links - impossible to make duplicates so no need to check if there already if ligand.properities.web_links.count()<len(l['web_links']): for link in l['web_links']: wr = WebResource.objects.get(slug=link['web_resource']) wl, created = WebLink.objects.get_or_create(index=link['index'], web_resource=wr) ligand.properities.web_links.add(wl) # create vendors - impossible to make duplicates so no need to check if there already if ligand.properities.vendors.count()<len(l['vendors']): for link in l['vendors']: lv = LigandVendors.objects.get(slug = link['vendor_slug']) check = LigandVendorLink.objects.filter(sid=link['sid']).exists() if not check: lvl = LigandVendorLink() lvl.sid = link['sid'] lvl.vendor = lv lvl.lp = ligand.properities lvl.vendor_external_id = link['vendor_external_id'] lvl.url = link['url'] lvl.save()
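# --- Illustrative sketch (standalone, not part of the loader class) ---------------------
# main_func above is driven by a shared counter and a lock so that several worker
# processes can claim items from one list without overlapping. A minimal, self-contained
# sketch of that claim-next-item pattern; the squaring "work" is a placeholder, and the
# re-check inside the lock is added here to close the small race at the end of the list.
import multiprocessing

def worker(items, count, lock, results):
    while count.value < len(items):
        with lock:
            if count.value >= len(items):
                break  # another process claimed the last item after our outer check
            item = items[count.value]
            count.value += 1
        results.append(item * item)  # stand-in for the per-ligand work

if __name__ == '__main__':
    items = list(range(100))
    count = multiprocessing.Value('i', 0)
    lock = multiprocessing.Lock()
    manager = multiprocessing.Manager()
    results = manager.list()
    procs = [multiprocessing.Process(target=worker, args=(items, count, lock, results))
             for _ in range(4)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    assert sorted(results) == sorted(i * i for i in items)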
def main_func(self, positions, iteration): # filenames if not positions[1]: rows = self.data[positions[0]:] else: rows = self.data[positions[0]:positions[1]] missing_proteins = {} mutants_for_proteins = {} c = 0 skipped = 0 inserted = 0 bulk_m = [] bulk_r = [] current_sheet = time.time() for r in rows: # print(source_file,c) # PRINT IF ERRORS OCCUR # self.logger.info('File '+str(r['source_file'])+' number '+str(c)) current = time.time() c += 1 # if c%100==0: # self.logger.info('Parsed '+str(c)+' mutant data entries') # publication try: #fix if it thinks it's float. float(r['reference']) r['reference'] = str(int(r['reference'])) float(r['review']) r['review'] = str(int(r['review'])) except ValueError: pass if r['reference'].isdigit(): #assume pubmed pub_type = 'pubmed' else: #assume doi pub_type = 'doi' if r['reference'] not in self.publication_cache: try: pub = Publication.objects.get(web_link__index=r['reference'], web_link__web_resource__slug=pub_type) except Publication.DoesNotExist: pub = Publication() try: pub.web_link = WebLink.objects.get(index=r['reference'], web_resource__slug=pub_type) except WebLink.DoesNotExist: wl = WebLink.objects.create(index=r['reference'], web_resource = WebResource.objects.get(slug=pub_type)) pub.web_link = wl if pub_type == 'doi': pub.update_from_doi(doi=r['reference']) elif pub_type == 'pubmed': pub.update_from_pubmed_data(index=r['reference']) try: pub.save() except: self.logger.error('error with reference ' + str(r['reference']) + ' ' + pub_type) continue #if something off with publication, skip. self.publication_cache[r['reference']] = pub else: pub = self.publication_cache[r['reference']] # print(r['review'],r['reference']) if r['review'].isdigit(): #assume pubmed pub_type = 'pubmed' else: #assume doi pub_type = 'doi' # print(r['review'],pub_type) if r['review']: if r['review'] not in self.publication_cache: try: pub_review = Publication.objects.get(web_link__index=r['review'], web_link__web_resource__slug=pub_type) except Publication.DoesNotExist: pub_review = Publication() try: pub_review.web_link = WebLink.objects.get(index=r['review'], web_resource__slug=pub_type) except WebLink.DoesNotExist: wl = WebLink.objects.create(index=r['review'], web_resource = WebResource.objects.get(slug=pub_type)) pub_review.web_link = wl if pub_type == 'doi': pub_review.update_from_doi(doi=r['review']) elif pub_type == 'pubmed': pub_review.update_from_pubmed_data(index=r['review']) try: pub_review.save() except: self.logger.error('error with review ' + str(r['review']) + ' ' + pub_type) continue #if something off with publication, skip. 
self.publication_cache[r['review']] = pub_review else: pub_review = self.publication_cache[r['review']] else: pub_review = None l = None if str(r['ligand_name']) in self.ligand_cache: if r['ligand_id'] in self.ligand_cache[str(r['ligand_name'])]: l = self.ligand_cache[str(r['ligand_name'])][r['ligand_id']] else: self.ligand_cache[str(r['ligand_name'])] = {} if not l: l = get_or_make_ligand(r['ligand_id'],r['ligand_type'],str(r['ligand_name'])) self.ligand_cache[str(r['ligand_name'])][r['ligand_id']] = l l_ref = None if str(r['exp_mu_ligand_ref']) in self.ref_ligand_cache: l_ref = self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])] else: if Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=True).exists(): #if this name is canonical and it has a ligand record already l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=True) elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=False).exists(): #if this matches an alias that only has "one" parent canonical name - eg distinct l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=False) elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=True).exists(): #if this matches an alias that only has several canonical parents, must investigate, start with empty. lp = LigandProperities() lp.save() l_ref = Ligand() l_ref.properities = lp l_ref.name = r['exp_mu_ligand_ref'] l_ref.canonical = False l_ref.ambigious_alias = True l_ref.save() l_ref.load_by_name(r['exp_mu_ligand_ref']) l_ref.save() elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False).exists(): #amigious_alias not specified l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False) l_ref.ambigious_alias = False l_ref.save() elif r['exp_mu_ligand_ref']: #if neither a canonical or alias exists, create the records. Remember to check for canonical / alias status. 
lp = LigandProperities() lp.save() l_ref = Ligand() l_ref.properities = lp l_ref.name = r['exp_mu_ligand_ref'] l_ref.canonical = True l_ref.ambigious_alias = False l_ref.save() l_ref.load_by_name(r['exp_mu_ligand_ref']) try: l_ref.save() except IntegrityError: l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False) print("error failing ligand, duplicate?") # logger.error("FAILED SAVING LIGAND, duplicate?") else: l_ref = None self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])] = l_ref protein_id = 0 residue_id = 0 protein=Protein.objects.filter(entry_name=r['protein']) if protein.exists(): protein=protein.get() if r['protein'] in mutants_for_proteins: mutants_for_proteins[r['protein']] += 1 else: mutants_for_proteins[r['protein']] = 1 else: skipped += 1 if r['protein'] in missing_proteins: missing_proteins[r['protein']] += 1 else: missing_proteins[r['protein']] = 1 self.logger.error('Skipped due to no protein '+ r['protein']) continue res=Residue.objects.filter(protein_conformation__protein=protein,amino_acid=r['mutation_from'],sequence_number=r['mutation_pos']) #FIXME MAKE AA CHECK if res.exists(): res=res.get() else: self.logger.error('Skipped due to no residue or mismatch AA ' + r['protein'] + ' pos:'+str(r['mutation_pos']) + ' AA:'+r['mutation_from']) skipped += 1 continue if r['ligand_class']: l_role, created = LigandRole.objects.get_or_create(name=r['ligand_class'], defaults={'slug': slugify(r['ligand_class'])[:50]}) # FIXME this should not be needed else: l_role = None if r['exp_type']: exp_type_id, created = MutationExperimentalType.objects.get_or_create(type=r['exp_type']) else: exp_type_id = None if r['exp_func']: exp_func_id, created = MutationFunc.objects.get_or_create(func=r['exp_func']) else: exp_func_id = None if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']: exp_qual_id, created = MutationQual.objects.get_or_create(qual=r['exp_mu_effect_qual'], prop=r['exp_mu_effect_ligand_prop']) else: exp_qual_id = None if r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r['opt_sign'] or r['opt_percentage'] or r['opt_qual'] or r['opt_agonist']: exp_opt_id, created = MutationOptional.objects.get_or_create(type=r['opt_type'], wt=r['opt_wt'], mu=r['opt_mu'], sign=r['opt_sign'], percentage=r['opt_percentage'], qual=r['opt_qual'], agonist=r['opt_agonist']) else: exp_opt_id = None try: mutation, created = Mutation.objects.get_or_create(amino_acid=r['mutation_to'],protein=protein, residue=res) except IntegrityError: mutation = Mutation.objects.get(amino_acid=r['mutation_to'],protein=protein, residue=res) logtypes = ['pEC50','pIC50','pK'] foldchange = 0 typefold = '' if r['exp_wt_value']!=0 and r['exp_mu_value_raw']!=0: #fix for new format if re.match("(" + ")|(".join(logtypes) + ")", r['exp_type']): #-log values! 
foldchange = round(math.pow(10,-r['exp_mu_value_raw'])/pow(10,-r['exp_wt_value']),3); typefold = r['exp_type']+"_log" else: foldchange = round(r['exp_mu_value_raw']/r['exp_wt_value'],3); typefold = r['exp_type']+"_not_log" if foldchange<1 and foldchange!=0: foldchange = -round((1/foldchange),3) elif r['fold_effect']!=0: foldchange = round(r['fold_effect'],3); if foldchange<1: foldchange = -round((1/foldchange),3); raw_experiment = self.insert_raw(r) bulk = MutationExperiment( refs=pub, review=pub_review, protein=protein, residue=res, ligand=l, ligand_role=l_role, ligand_ref = l_ref, #raw = raw_experiment, #raw_experiment, OR None optional = exp_opt_id, exp_type=exp_type_id, exp_func=exp_func_id, exp_qual = exp_qual_id, mutation=mutation, wt_value=r['exp_wt_value'], # wt_unit=r['exp_wt_unit'], mu_value = r['exp_mu_value_raw'], mu_sign = r['exp_mu_effect_sign'], foldchange = foldchange ) # mut_id = obj.id bulk_r.append(raw_experiment) bulk_m.append(bulk) inserted += 1 end = time.time() diff = round(end - current,2) #print(diff) self.logger.info('Parsed '+str(c)+' mutant data entries. Skipped '+str(skipped)) current = time.time() raws = MutationRaw.objects.bulk_create(bulk_r) for i,me in enumerate(bulk_m): me.raw = raws[i] MutationExperiment.objects.bulk_create(bulk_m) end = time.time() diff = round(end - current,2) diff_2 = round(end - current_sheet,2) print("overall",diff_2,"bulk",diff,len(bulk_m),"skipped",str(skipped))
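# --- Illustrative sketch (standalone, not part of the loader class) ---------------------
# Both main_func variants first normalise reference identifiers that Excel has turned into
# floats (e.g. 12345678.0) and then guess the resource type: all-digit strings are treated
# as PubMed IDs and anything else as a DOI; the variant that follows additionally routes
# http(s) review links to a 'raw_link' resource. A standalone sketch of that normalisation;
# the function name and the example DOI string are illustrative only.
def classify_reference(ref):
    ref = str(ref)
    try:
        # Excel sometimes hands back "12345678.0" for a PubMed ID
        ref = str(int(float(ref)))
    except ValueError:
        pass
    if ref.isdigit():
        return ref, 'pubmed'
    if ref.startswith('http'):
        return ref, 'raw_link'  # only used for the review column in the loader below
    return ref, 'doi'

assert classify_reference(25533411.0) == ('25533411', 'pubmed')
assert classify_reference('10.1000/example.doi') == ('10.1000/example.doi', 'doi')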
def main_func(self, positions, iteration,count,lock): # filenames # if not positions[1]: # rows = self.data[positions[0]:] # else: # rows = self.data[positions[0]:positions[1]] missing_proteins = {} mutants_for_proteins = {} wrong_uniport_ids = {} c = 0 skipped = 0 inserted = 0 bulk_m = [] bulk_r = [] current_sheet = time.time() rows = self.data_all while count.value<len(rows): with lock: r = rows[count.value] count.value +=1 # for r in rows: # print(r['source_file'],c) # PRINT IF ERRORS OCCUR #self.logger.info('File '+str(r['source_file'])+' number '+str(c)) current = time.time() c += 1 # if c%100==0: # self.logger.info('Parsed '+str(c)+' mutant data entries') # publication try: #fix if it thinks it's float. float(r['reference']) r['reference'] = str(int(r['reference'])) float(r['review']) r['review'] = str(int(r['review'])) except ValueError: pass if r['reference'].isdigit(): #assume pubmed pub_type = 'pubmed' else: #assume doi pub_type = 'doi' if r['reference'] not in self.publication_cache: try: wl = WebLink.objects.get(index=r['reference'], web_resource__slug=pub_type) except WebLink.DoesNotExist: try: wl = WebLink.objects.create(index=r['reference'], web_resource = WebResource.objects.get(slug=pub_type)) except IntegrityError: wl = WebLink.objects.get(index=r['reference'], web_resource__slug=pub_type) try: pub = Publication.objects.get(web_link=wl) except Publication.DoesNotExist: pub = Publication() try: pub.web_link = wl pub.save() except IntegrityError: pub = Publication.objects.get(web_link=wl) if pub_type == 'doi': pub.update_from_doi(doi=r['reference']) elif pub_type == 'pubmed': pub.update_from_pubmed_data(index=r['reference']) try: pub.save() except: self.logger.error('error with reference ' + str(r['reference']) + ' ' + pub_type) continue #if something off with publication, skip. self.publication_cache[r['reference']] = pub else: pub = self.publication_cache[r['reference']] # print(r['review'],r['reference']) if r['review'].isdigit(): #assume pubmed pub_type = 'pubmed' elif r['review'].startswith('http'): pub_type = 'raw_link' else: #assume doi pub_type = 'doi' # print(r['review'],pub_type) if r['review']: if r['review'] not in self.publication_cache: try: wl = WebLink.objects.get(index=r['review'], web_resource__slug=pub_type) except WebLink.DoesNotExist: try: wl = WebLink.objects.create(index=r['review'], web_resource = WebResource.objects.get(slug=pub_type)) except IntegrityError: wl = WebLink.objects.get(index=r['review'], web_resource__slug=pub_type) try: pub_review = Publication.objects.get(web_link=wl) except Publication.DoesNotExist: pub_review = Publication() try: pub_review.web_link = wl pub_review.save() except IntegrityError: pub_review = Publication.objects.get(web_link=wl) if pub_type == 'doi': pub_review.update_from_doi(doi=r['review']) elif pub_type == 'pubmed': pub_review.update_from_pubmed_data(index=r['review']) try: pub_review.save() except: self.logger.error('error with review ' + str(r['review']) + ' ' + pub_type) continue #if something off with publication, skip. 
self.publication_cache[r['review']] = pub_review else: pub_review = self.publication_cache[r['review']] else: pub_review = None l = None if str(r['ligand_name']) in self.ligand_cache: if r['ligand_id'] in self.ligand_cache[str(r['ligand_name'])]: l = self.ligand_cache[str(r['ligand_name'])][r['ligand_id']] else: self.ligand_cache[str(r['ligand_name'])] = {} if not l: try: l = get_or_make_ligand(r['ligand_id'],r['ligand_type'],str(r['ligand_name'])) except Exception as msg: print('Something errored with ligand, aborting entry of mutation',r['ligand_name'],r['ligand_type'],r['ligand_id'],r['source_file']) print(msg) traceback.print_exc() continue self.ligand_cache[str(r['ligand_name'])][r['ligand_id']] = l l_ref = None if str(r['exp_mu_ligand_ref']) in self.ref_ligand_cache: l_ref = self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])] else: if Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=True).exists(): #if this name is canonical and it has a ligand record already l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=True) elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=False).exists(): #if this matches an alias that only has "one" parent canonical name - eg distinct l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=False) elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=True).exists(): #if this matches an alias that only has several canonical parents, must investigate, start with empty. lp = LigandProperities() lp.save() l_ref = Ligand() l_ref.properities = lp l_ref.name = r['exp_mu_ligand_ref'] l_ref.canonical = False l_ref.ambigious_alias = True l_ref.save() l_ref.load_by_name(r['exp_mu_ligand_ref']) l_ref.save() elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False).exists(): #amigious_alias not specified l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False) l_ref.ambigious_alias = False l_ref.save() elif r['exp_mu_ligand_ref']: #if neither a canonical or alias exists, create the records. Remember to check for canonical / alias status. 
lp = LigandProperities() lp.save() l_ref = Ligand() l_ref.properities = lp l_ref.name = r['exp_mu_ligand_ref'] l_ref.canonical = True l_ref.ambigious_alias = False try: l_ref.save() l_ref.load_by_name(r['exp_mu_ligand_ref']) except IntegrityError: if Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=True).exists(): l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=True) else: l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False) # print("error failing ligand, duplicate?") try: l_ref.save() except IntegrityError: l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False) # print("error failing ligand, duplicate?") # logger.error("FAILED SAVING LIGAND, duplicate?") else: l_ref = None self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])] = l_ref protein_id = 0 residue_id = 0 protein=Protein.objects.filter(entry_name=r['protein']) if protein.exists(): protein=protein.get() if r['protein'] in mutants_for_proteins: mutants_for_proteins[r['protein']] += 1 else: mutants_for_proteins[r['protein']] = 1 elif r['protein'] not in missing_proteins: try: r['protein'] = wrong_uniport_ids[r['protein']] real_uniprot = wrong_uniport_ids[r['protein']] protein=Protein.objects.get(entry_name=r['protein']) # print('fetched with lookup table',r['protein']) except: # look for it as uniprot protein=Protein.objects.filter(web_links__web_resource__slug='uniprot', web_links__index=r['protein'].upper()) if protein.exists(): protein=protein.get() real_uniprot = protein.entry_name if r['protein'] in mutants_for_proteins: mutants_for_proteins[r['protein']] += 1 else: mutants_for_proteins[r['protein']] = 1 else: # Try to lookup in uniprot to catch typing errors / variants in entry_name url = 'http://www.uniprot.org/uniprot/$index.xml' cache_dir = ['uniprot', 'id'] uniprot_protein = fetch_from_web_api(url, r['protein'], cache_dir, xml = True) try: real_uniprot = uniprot_protein.find('.//{http://uniprot.org/uniprot}name').text.lower() protein=Protein.objects.get(entry_name=real_uniprot) except: skipped += 1 if r['protein'] in missing_proteins: missing_proteins[r['protein']] += 1 else: missing_proteins[r['protein']] = 1 # print('Skipped due to no protein '+ r['protein']) self.logger.error('Skipped due to no protein '+ r['protein']) continue wrong_uniport_ids[r['protein']] = protein.entry_name r['protein'] = real_uniprot else: missing_proteins[r['protein']] += 1 continue res=Residue.objects.filter(protein_conformation__protein=protein,amino_acid=r['mutation_from'],sequence_number=r['mutation_pos']) #FIXME MAKE AA CHECK if res.exists(): res=res.get() else: self.logger.error('Skipped due to no residue or mismatch AA ' + r['protein'] + ' pos:'+str(r['mutation_pos']) + ' AA:'+r['mutation_from']) # print('Skipped due to no residue or mismatch AA ' + r['protein'] + ' pos:'+str(r['mutation_pos']) + ' AA:'+r['mutation_from'],r['source_file']) skipped += 1 continue if r['ligand_class']: try: l_role, created = LigandRole.objects.get_or_create(name=r['ligand_class'], defaults={'slug': slugify(r['ligand_class'])[:50]}) # FIXME this should not be needed except Exception as e: if LigandRole.objects.filter(slug=slugify(r['ligand_class'])[:50]).exists(): l_role = LigandRole.objects.get(slug=slugify(r['ligand_class'])[:50]) if l_role.name == slugify(r['ligand_class'])[:50]: #if name of role is same as slug, then it was created by constructs script, replace it l_role.name = r['ligand_class'] l_role.save() else: print(e) print("Error with",r['ligand_class'],slugify(r['ligand_class'])[:50] 
) l_role, created = LigandRole.objects.get_or_create(slug=slugify(r['ligand_class'])[:50]) # FIXME this should not be needed else: l_role = None if r['exp_type']: exp_type_id, created = MutationExperimentalType.objects.get_or_create(type=r['exp_type']) else: exp_type_id = None if r['exp_func']: exp_func_id, created = MutationFunc.objects.get_or_create(func=r['exp_func']) else: exp_func_id = None if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']: exp_qual_id, created = MutationQual.objects.get_or_create(qual=r['exp_mu_effect_qual'], prop=r['exp_mu_effect_ligand_prop']) else: exp_qual_id = None # if r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r['opt_sign'] or r['opt_percentage'] or r['opt_qual'] or r['opt_agonist']: # exp_opt_id, created = MutationOptional.objects.get_or_create(type=r['opt_type'], wt=r['opt_wt'], mu=r['opt_mu'], sign=r['opt_sign'], percentage=r['opt_percentage'], qual=r['opt_qual'], agonist=r['opt_agonist']) # else: # exp_opt_id = None try: mutation, created = Mutation.objects.get_or_create(amino_acid=r['mutation_to'],protein=protein, residue=res) except IntegrityError: mutation = Mutation.objects.get(amino_acid=r['mutation_to'],protein=protein, residue=res) logtypes = ['pEC50','pIC50','pK'] foldchange = 0 typefold = '' if r['exp_wt_value']!=0 and r['exp_mu_value_raw']!=0: #fix for new format if re.match("(" + ")|(".join(logtypes) + ")", r['exp_type']): #-log values! try: foldchange = round(math.pow(10,-r['exp_mu_value_raw'])/pow(10,-r['exp_wt_value']),3); except: print(r) typefold = r['exp_type']+"_log" elif "%"==r['exp_wt_unit']: # if % then it's a difference case, then lower value is bad. Otherwise it's conc and lower is better foldchange = round(r['exp_wt_value']/r['exp_mu_value_raw'],3); else: foldchange = round(r['exp_mu_value_raw']/r['exp_wt_value'],3); typefold = r['exp_type']+"_not_log" if foldchange>0 and foldchange<1 and foldchange!=0: foldchange = -round((1/foldchange),3) elif r['fold_effect']!=0: foldchange = round(r['fold_effect'],3); if foldchange<1: foldchange = -round((1/foldchange),3); r['fold_effect'] = foldchange raw_experiment = self.insert_raw(r) # raw_experiment.save() bulk = MutationExperiment( refs=pub, review=pub_review, submitting_group = r['submitting_group'], data_container = r['data_container'], data_container_number = r['data_container_number'], protein=protein, residue=res, ligand=l, ligand_role=l_role, ligand_ref = l_ref, # raw = raw_experiment, #raw_experiment, OR None # optional = exp_opt_id, exp_type=exp_type_id, exp_func=exp_func_id, exp_qual = exp_qual_id, mutation=mutation, wt_value=r['exp_wt_value'], # wt_unit=r['exp_wt_unit'], mu_value = r['exp_mu_value_raw'], mu_sign = r['exp_mu_effect_sign'], foldchange = foldchange, opt_receptor_expression = r['opt_receptor_expression'], opt_basal_activity = r['opt_basal_activity'], opt_gain_of_activity = r['opt_gain_of_activity'], opt_ligand_emax = r['opt_ligand_emax'], opt_agonist = r['opt_agonist'], ) # for line,val in r.items(): # val = str(val) # if len(val)>100: # print(line,"too long",val) # mut_id = obj.id bulk_r.append(raw_experiment) bulk_m.append(bulk) # try: # bulk.save() # except Exception as e: # print(e) # print(r) # break #print('saved ',r['source_file']) inserted += 1 end = time.time() diff = round(end - current,2) #print(diff) self.logger.info('Parsed '+str(c)+' mutant data entries. 
Skipped '+str(skipped)) current = time.time() raws = MutationRaw.objects.bulk_create(bulk_r) for i,me in enumerate(bulk_m): me.raw = raws[i] MutationExperiment.objects.bulk_create(bulk_m) end = time.time() diff = round(end - current,2) diff_2 = round(end - current_sheet,2) print("overall",diff_2,"bulk",diff,len(bulk_m),"skipped",str(skipped)) sorted_missing_proteins = sorted(missing_proteins.items(), key=operator.itemgetter(1),reverse=True)
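# --- Illustrative sketch (standalone, not part of the loader class) ---------------------
# The tail of main_func avoids one INSERT per row by collecting MutationRaw and
# MutationExperiment instances and writing each set with a single bulk_create, pairing the
# two lists up by position afterwards. A hedged sketch of that pattern, assuming a
# configured Django project, an assumed model import path, and a backend such as
# PostgreSQL where bulk_create returns objects with their primary keys assigned.
from mutation.models import MutationRaw, MutationExperiment  # assumed app path

def flush_mutation_batch(bulk_r, bulk_m):
    """bulk_r[i] is the raw record behind bulk_m[i]; the two lists are built in step."""
    raws = MutationRaw.objects.bulk_create(bulk_r)
    for raw, experiment in zip(raws, bulk_m):
        experiment.raw = raw  # positional pairing, as in the loop above
    MutationExperiment.objects.bulk_create(bulk_m)
    return len(bulk_m)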