Example #1
	def create_publication(self, doi, wr):
		'''Create WebLink and Publication objects'''
		if doi!='':
			try:
				pub = Publication.objects.get(web_link__index=doi, web_link__web_resource=wr)
			except Publication.DoesNotExist as e:
				pub = Publication()
				wl, created = WebLink.objects.get_or_create(index=doi, web_resource=wr)
				pub.web_link = wl
				pub.update_from_doi(doi=doi)
				pub.save()
				self.logger.info('Created Publication:'+str(pub))
			return pub
		else:
			return None
Example #2
    def fetch_publication(self, publication_doi):
        """
        Fetch (or create) a Publication object for the given identifier.
        Requires a publication DOI or PubMed ID.
        """
        try:
            float(publication_doi)
            publication_doi = str(int(publication_doi))
        except ValueError:
            pass

        if publication_doi.isdigit():  # assume pubmed
            pub_type = 'pubmed'
        else:  # assume doi
            pub_type = 'doi'
        if publication_doi not in self.publication_cache:
            try:
                wl = WebLink.objects.get(index=publication_doi,
                                         web_resource__slug=pub_type)
            except WebLink.DoesNotExist:
                try:
                    wl = WebLink.objects.create(
                        index=publication_doi,
                        web_resource=WebResource.objects.get(slug=pub_type))
                except IntegrityError:
                    wl = WebLink.objects.get(index=publication_doi,
                                             web_resource__slug=pub_type)

            try:
                pub = Publication.objects.get(web_link=wl)
            except Publication.DoesNotExist:
                pub = Publication()
                try:
                    pub.web_link = wl
                    pub.save()
                except IntegrityError:
                    pub = Publication.objects.get(web_link=wl)

                if pub_type == 'doi':
                    pub.update_from_doi(doi=publication_doi)
                elif pub_type == 'pubmed':
                    pub.update_from_pubmed_data(index=publication_doi)
                try:
                    pub.save()
                except:
                    self.mylog.debug(
                        "publication fetching error | module: fetch_publication. Row # is : "
                        + str(publication_doi) + ' ' + pub_type)
                    # if something off with publication, skip.
            self.publication_cache[publication_doi] = pub
        else:
            pub = self.publication_cache[publication_doi]

        return pub
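
The pattern above (get the WebLink, fall back to create, and catch IntegrityError in case another worker won the race) recurs throughout these loaders. Below is a condensed sketch of the same idea; the common.models import path is an assumption and may need adjusting to the actual project layout.

# Condensed sketch of the race-tolerant publication lookup used above.
# The import path is an assumption; adjust it to wherever WebLink,
# WebResource and Publication live in your project.
from django.db import IntegrityError


def get_or_create_publication(index, slug):
    """Return the Publication for a DOI or PubMed index, tolerating concurrent inserts."""
    from common.models import Publication, WebLink, WebResource  # assumed module path

    try:
        wl = WebLink.objects.get(index=index, web_resource__slug=slug)
    except WebLink.DoesNotExist:
        try:
            wl = WebLink.objects.create(index=index,
                                        web_resource=WebResource.objects.get(slug=slug))
        except IntegrityError:
            # another worker created the link first
            wl = WebLink.objects.get(index=index, web_resource__slug=slug)
    pub, _ = Publication.objects.get_or_create(web_link=wl)
    return pub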
Example #3
    def main_func(self, positions, iteration,count,lock):

        missing_proteins = {}

        c = 0
        skipped = 0
        rows = self.data_all

        while count.value<len(rows):
            with lock:
                r = rows[count.value]
                count.value +=1 

            current = time.time()
            c += 1

            # publication
            try: #fix if it thinks it's float.
                float(r['reference'])
                r['reference'] = str(int(r['reference']))
                float(r['review'])
                r['review'] = str(int(r['review']))
            except ValueError:
                pass

            if r['reference'].isdigit(): #assume pubmed
                pub_type = 'pubmed'
            else: #assume doi
                pub_type = 'doi'

            if r['reference'] not in self.publication_cache:
                try:
                    wl = WebLink.objects.get(index=r['reference'], web_resource__slug=pub_type)
                except WebLink.DoesNotExist:
                    try:
                        wl = WebLink.objects.create(index=r['reference'],
                                web_resource = WebResource.objects.get(slug=pub_type))
                    except IntegrityError:
                        wl = WebLink.objects.get(index=r['reference'], web_resource__slug=pub_type)


                try:
                    pub = Publication.objects.get(web_link=wl)
                except Publication.DoesNotExist:
                    pub = Publication()
                    try:
                        pub.web_link = wl
                        pub.save()
                    except IntegrityError:
                        pub = Publication.objects.get(web_link=wl)

                    if pub_type == 'doi':
                        pub.update_from_doi(doi=r['reference'])
                    elif pub_type == 'pubmed':
                        pub.update_from_pubmed_data(index=r['reference'])
                    try:
                        pub.save()
                    except:
                        self.logger.error('error with reference ' + str(r['reference']) + ' ' + pub_type)
                        continue #if something off with publication, skip.
                self.publication_cache[r['reference']] = pub
            else:
                pub = self.publication_cache[r['reference']]


            l = None
            if str(r['ligand_name']) in self.ligand_cache:
                if r['ligand_id'] in self.ligand_cache[str(r['ligand_name'])]:
                    l = self.ligand_cache[str(r['ligand_name'])][r['ligand_id']]
            else:
                self.ligand_cache[str(r['ligand_name'])] = {}

            if not l:
                try:
                    l = get_or_make_ligand(r['ligand_id'],r['ligand_type'],str(r['ligand_name']))
                except Exception as msg:
                    print('Something errored with ligand, aborting entry of mutation',r['ligand_name'],r['ligand_type'],r['ligand_id'],r['source_file'])
                    print(msg)
                    traceback.print_exc()
                    continue
                self.ligand_cache[str(r['ligand_name'])][r['ligand_id']] = l


            protein=Protein.objects.filter(entry_name=r['protein'])
            if protein.exists():
                protein=protein.get()

            elif r['protein'] not in missing_proteins:
                # Could contain code that tries to figure out which protein this is
                # (e.g. a UniProt lookup); here the unknown entry is just recorded and skipped.
                missing_proteins[r['protein']] = 1
                skipped += 1
                continue
            else:
                missing_proteins[r['protein']] += 1
                continue

            res=Residue.objects.filter(protein_conformation__protein=protein,amino_acid=r['mutation_from'],sequence_number=r['mutation_pos']) #FIXME MAKE AA CHECK
            if res.exists():
                res=res.get()
            else:
                self.logger.error('Skipped due to no residue or mismatch AA ' + r['protein'] + ' pos:'+str(r['mutation_pos']) + ' AA:'+r['mutation_from'])
                # print('Skipped due to no residue or mismatch AA ' + r['protein'] + ' pos:'+str(r['mutation_pos']) + ' AA:'+r['mutation_from'],r['source_file'])
                skipped += 1
                continue

        self.logger.info('Parsed '+str(c)+' bias data entries. Skipped '+str(skipped))

        sorted_missing_proteins = sorted(missing_proteins.items(), key=operator.itemgetter(1),reverse=True)
        print(missing_proteins)
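
main_func above is meant to run in several worker processes that pull rows from a shared list: count is a shared multiprocessing.Value and lock guards the claim of the next index. A minimal, self-contained sketch of that dispatch (names are illustrative, and the counter is re-checked after the lock is acquired so two workers cannot both claim the last row):

# Minimal sketch of the shared-counter dispatch used by main_func.
# Names are illustrative, not part of the build script.
import multiprocessing as mp


def worker(rows, count, lock):
    while count.value < len(rows):
        with lock:
            if count.value >= len(rows):  # re-check once the lock is held
                break
            r = rows[count.value]
            count.value += 1
        # ... process row r here ...
        print(r)


if __name__ == '__main__':
    rows = ['row%d' % i for i in range(10)]
    count = mp.Value('i', 0)  # shared row counter
    lock = mp.Lock()
    procs = [mp.Process(target=worker, args=(rows, count, lock)) for _ in range(4)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()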
Example #4
    def new_xtals(self, uniprot):
        ''' List GPCR crystal structures missing from GPCRdb and the yaml files. Adds missing structures to DB.
        '''
        structs = self.pdb_request_by_uniprot(uniprot)
        try:
            protein = Protein.objects.get(accession=uniprot)
        except:
            protein = None
        try:
            x50s = Residue.objects.filter(protein_conformation__protein=protein,generic_number__label__in=['1x50','2x50','3x50','4x50','5x50','6x50','7x50'])
        except:
            x50s = None
        if structs!=['null']:
            for s in structs:
                missing_from_db, missing_yaml = False, False
                try:
                    st_obj = Structure.objects.get(pdb_code__index=s)
                except:
                    if s not in self.exceptions:
                        check = self.pdb_request_by_pdb(s)
                        if check==1:
                            self.db_list.append(s)
                            missing_from_db = True
                if s not in self.yamls and s not in self.exceptions:
                    if s not in self.db_list:
                        check = self.pdb_request_by_pdb(s)
                    else:
                        check = 1
                    if check==1:
                        self.yaml_list.append(s)
                        missing_yaml = True
                if not missing_from_db:
                    continue
                try:
                    pdb_data_dict = fetch_pdb_info(s, protein, new_xtal=True)
                    exp_method = pdb_data_dict['experimental_method']
                    if exp_method=='Electron Microscopy':
                        st_type = StructureType.objects.get(slug='electron-microscopy')
                    elif exp_method=='X-ray diffraction':
                        st_type = StructureType.objects.get(slug='x-ray-diffraction')
                    if 'deletions' in pdb_data_dict:
                        for d in pdb_data_dict['deletions']:
                            presentx50s = []
                            for x in x50s:
                                if not d['start']<x.sequence_number<d['end']:
                                    presentx50s.append(x)                                    
                            # Filter out ones without all 7 x50 positions present in the xtal
                            if len(presentx50s)!=7:
                                try:
                                    del self.db_list[self.db_list.index(s)]
                                    missing_from_db = False
                                    del self.yaml_list[self.yaml_list.index(s)]
                                except:
                                    pass
                    else:
                        print('Warning: no deletions in pdb info, check {}'.format(s))
                        continue

                    if missing_from_db:
                        pref_chain = ''
                        resolution = pdb_data_dict['resolution']
                        pdb_code, created = WebLink.objects.get_or_create(index=s, web_resource=WebResource.objects.get(slug='pdb'))
                        pdbl = PDB.PDBList()
                        pdbl.retrieve_pdb_file(s, pdir='./', file_format="pdb")
                        with open('./pdb{}.ent'.format(s).lower(),'r') as f:
                            lines = f.readlines()
                        pdb_file = ''
                        publication_date, pubmed, doi = '','',''
                        state = ProteinState.objects.get(slug='inactive')
                        new_prot, created = Protein.objects.get_or_create(entry_name=s.lower(), accession=None, name=s.lower(), sequence=pdb_data_dict['wt_seq'], family=protein.family,
                                                                          parent=protein, residue_numbering_scheme=protein.residue_numbering_scheme, 
                                                                          sequence_type=ProteinSequenceType.objects.get(slug='mod'), source=ProteinSource.objects.get(name='OTHER'), 
                                                                          species=protein.species)
                        new_prot_conf, created = ProteinConformation.objects.get_or_create(protein=new_prot, state=state, template_structure=None)
                        for line in lines:
                            if line.startswith('REVDAT   1'):
                                publication_date = line[13:22]
                            if line.startswith('JRNL        PMID'):
                                pubmed = line[19:].strip()
                            if line.startswith('JRNL        DOI'):
                                doi = line[19:].strip()
                            pdb_file+=line
                        pdb_data, created = PdbData.objects.get_or_create(pdb=pdb_file)
                        d = datetime.strptime(publication_date,'%d-%b-%y')
                        publication_date = d.strftime('%Y-%m-%d')
                        try:
                            if doi!='':
                                try:
                                    publication = Publication.objects.get(web_link__index=doi)
                                except Publication.DoesNotExist as e:
                                    p = Publication()
                                    try:
                                        p.web_link = WebLink.objects.get(index=doi, web_resource__slug='doi')
                                    except WebLink.DoesNotExist:
                                        wl = WebLink.objects.create(index=doi,
                                            web_resource = WebResource.objects.get(slug='doi'))
                                        p.web_link = wl
                                    p.update_from_doi(doi=doi)
                                    p.save()
                                    publication = p
                            elif pubmed!='':
                                try:
                                    publication = Publication.objects.get(web_link__index=pubmed)
                                except Publication.DoesNotExist as e:
                                    p = Publication()
                                    try:
                                        p.web_link = WebLink.objects.get(index=pubmed,
                                            web_resource__slug='pubmed')
                                    except WebLink.DoesNotExist:
                                        wl = WebLink.objects.create(index=pubmed,
                                            web_resource = WebResource.objects.get(slug='pubmed'))
                                        p.web_link = wl
                                    p.update_from_pubmed_data(index=pubmed)
                                    p.save()
                                    publication = p
                        except:
                            pass
                        pcs = PdbChainSelector(s, protein)
                        pcs.run_dssp()
                        preferred_chain = pcs.select_chain()

                        # Run state identification

                        # Create yaml files
                        with open('../../data/protwis/gpcr/structure_data/constructs/{}.yaml'.format(pdb_code.index), 'w') as construct_file:
                            yaml.dump({'name': pdb_code.index.lower(), 'protein': protein.entry_name}, construct_file, indent=4)
                        with open('../../data/protwis/gpcr/structure_data/structures/{}.yaml'.format(pdb_code.index), 'w') as structure_file:
                            struct_yaml_dict = {'construct': pdb_code.index.lower(), 'pdb': pdb_code.index, 'preferred_chain': preferred_chain, 'auxiliary_protein': '', 
                                                'ligand': {'name': 'None', 'pubchemId': 'None', 'title': 'None', 'role': '.nan', 'type': 'None'}, 'signaling_protein': 'None', 'state': 'Inactive'}
                            auxiliary_proteins, ligands = [], []
                            if pdb_data_dict['ligands']!='None':
                                for key, values in pdb_data_dict['ligands'].items():
                                    if key in ['SO4','NA','CLR','OLA','OLB','OLC','TAR','NAG','EPE','BU1','ACM','GOL','PEG','PO4','TLA','BOG','CIT','PLM','BMA','MAN','MLI','PGE']:
                                        continue
                                    else:
                                        ligands.append({'name': key, 'pubchemId': 'None', 'title': pdb_data_dict['ligands'][key]['comp_name'], 'role': '.nan', 'type': 'None'})
                                for key, values in pdb_data_dict['auxiliary'].items():
                                    if pdb_data_dict['auxiliary'][key]['subtype'] in ['Expression tag', 'Linker']:
                                        continue
                                    else:
                                        auxiliary_proteins.append(pdb_data_dict['auxiliary'][key]['subtype'])
                                for key, values in pdb_data_dict['construct_sequences'].items():
                                    if key!=protein.entry_name and key not in struct_yaml_dict['auxiliary_protein']:
                                        if 'arrestin' in key:
                                            struct_yaml_dict['signaling_protein'] = key
                                if len(auxiliary_proteins)>1:
                                    struct_yaml_dict['auxiliary_protein'] = ', '.join(auxiliary_proteins)
                                if len(ligands)>1:
                                    struct_yaml_dict['ligand'] = ligands
                            yaml.dump(struct_yaml_dict, structure_file, indent=4, default_flow_style=False)

                        # Build residue table for structure
                        build_structure_command = shlex.split('/env/bin/python3 manage.py build_structures -f {}.yaml'.format(pdb_code.index))
                        subprocess.call(build_structure_command)

                        # Check state
                        struct = Structure.objects.get(pdb_code__index=pdb_code.index)
                        pi = PdbStateIdentifier(struct)
                        pi.run()
                        if pi.state!=None:
                            Structure.objects.filter(pdb_code__index=pdb_code.index).update(state=pi.state)
                            print(pi.state, pi.activation_value)
                            with open('../../data/protwis/gpcr/structure_data/structures/{}.yaml'.format(pdb_code.index), 'r') as yf:
                                struct_yaml = yaml.load(yf)
                            struct_yaml['state'] = pi.state.name
                            try:
                                struct_yaml['distance'] = round(float(pi.activation_value), 2)
                            except:
                                struct_yaml['distance'] = None
                            with open('../../data/protwis/gpcr/structure_data/structures/{}.yaml'.format(pdb_code.index), 'w') as struct_yaml_file:
                                yaml.dump(struct_yaml, struct_yaml_file, indent=4, default_flow_style=False)
                
                        # Check sodium pocket
                        new_prot_conf.sodium_pocket()

                        print('{} added to db (preferred_chain chain: {})'.format(s, preferred_chain))
                except Exception as msg:
                    print(s, msg)
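
The download-and-parse step in new_xtals relies on fixed-column PDB header records: the first REVDAT record carries the release date, and the JRNL records carry the PMID and DOI. A standalone illustration of that slicing (the sample record lines are made up):

# Standalone illustration of the PDB header parsing done in new_xtals.
# The sample record lines below are made up for demonstration.
from datetime import datetime

sample = (
    "REVDAT   1   01-JAN-20 1ABC    0\n"
    "JRNL        PMID   12345678\n"
    "JRNL        DOI    10.1000/example-doi\n"
)

publication_date, pubmed, doi = '', '', ''
for line in sample.splitlines():
    if line.startswith('REVDAT   1'):
        publication_date = line[13:22]  # fixed-width date field, e.g. 01-JAN-20
    elif line.startswith('JRNL        PMID'):
        pubmed = line[19:].strip()
    elif line.startswith('JRNL        DOI'):
        doi = line[19:].strip()

print(datetime.strptime(publication_date, '%d-%b-%y').strftime('%Y-%m-%d'), pubmed, doi)
# -> 2020-01-01 12345678 10.1000/example-doi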
Example #5
    def main_func(self, positions, iteration):
        # filenames
        if not positions[1]:
            rows = self.data[positions[0]:]
        else:
            rows = self.data[positions[0]:positions[1]]

        missing_proteins = {}
        mutants_for_proteins = {}

        c = 0
        skipped = 0
        inserted = 0
        bulk_m = []
        bulk_r = []
        current_sheet = time.time()

        for r in rows:
            # print(source_file,c)
            # PRINT IF ERRORS OCCUR
            # self.logger.info('File '+str(r['source_file'])+' number '+str(c))
            current = time.time()
            c += 1
            # if c%100==0:
            #     self.logger.info('Parsed '+str(c)+' mutant data entries')

            # publication
            try:  #fix if it thinks it's float.
                float(r['reference'])
                r['reference'] = str(int(r['reference']))
                float(r['review'])
                r['review'] = str(int(r['review']))
            except ValueError:
                pass

            if r['reference'].isdigit():  #assume pubmed
                pub_type = 'pubmed'
            else:  #assume doi
                pub_type = 'doi'

            if r['reference'] not in self.publication_cache:
                try:
                    pub = Publication.objects.get(
                        web_link__index=r['reference'],
                        web_link__web_resource__slug=pub_type)
                except Publication.DoesNotExist:
                    pub = Publication()
                    try:
                        pub.web_link = WebLink.objects.get(
                            index=r['reference'], web_resource__slug=pub_type)
                    except WebLink.DoesNotExist:
                        wl = WebLink.objects.create(
                            index=r['reference'],
                            web_resource=WebResource.objects.get(
                                slug=pub_type))
                        pub.web_link = wl

                    if pub_type == 'doi':
                        pub.update_from_doi(doi=r['reference'])
                    elif pub_type == 'pubmed':
                        pub.update_from_pubmed_data(index=r['reference'])
                    try:
                        pub.save()
                    except:
                        self.logger.error('error with reference ' +
                                          str(r['reference']) + ' ' + pub_type)
                        continue  #if something off with publication, skip.
                self.publication_cache[r['reference']] = pub
            else:
                pub = self.publication_cache[r['reference']]

            # print(r['review'],r['reference'])
            if r['review'].isdigit():  #assume pubmed
                pub_type = 'pubmed'
            else:  #assume doi
                pub_type = 'doi'

            # print(r['review'],pub_type)
            if r['review']:
                if r['review'] not in self.publication_cache:
                    try:
                        pub_review = Publication.objects.get(
                            web_link__index=r['review'],
                            web_link__web_resource__slug=pub_type)
                    except Publication.DoesNotExist:
                        pub_review = Publication()
                        try:
                            pub_review.web_link = WebLink.objects.get(
                                index=r['review'], web_resource__slug=pub_type)
                        except WebLink.DoesNotExist:
                            wl = WebLink.objects.create(
                                index=r['review'],
                                web_resource=WebResource.objects.get(
                                    slug=pub_type))
                            pub_review.web_link = wl

                        if pub_type == 'doi':
                            pub_review.update_from_doi(doi=r['review'])
                        elif pub_type == 'pubmed':
                            pub_review.update_from_pubmed_data(
                                index=r['review'])
                        try:
                            pub_review.save()
                        except:
                            self.logger.error('error with review ' +
                                              str(r['review']) + ' ' +
                                              pub_type)
                            continue  #if something off with publication, skip.
                        self.publication_cache[r['review']] = pub_review
                else:
                    pub_review = self.publication_cache[r['review']]
            else:
                pub_review = None

            l = None
            if str(r['ligand_name']) in self.ligand_cache:
                if r['ligand_id'] in self.ligand_cache[str(r['ligand_name'])]:
                    l = self.ligand_cache[str(
                        r['ligand_name'])][r['ligand_id']]
            else:
                self.ligand_cache[str(r['ligand_name'])] = {}

            if not l:
                l = get_or_make_ligand(r['ligand_id'], r['ligand_type'],
                                       str(r['ligand_name']))
                self.ligand_cache[str(r['ligand_name'])][r['ligand_id']] = l

            l_ref = None
            if str(r['exp_mu_ligand_ref']) in self.ref_ligand_cache:
                l_ref = self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])]
            else:
                if Ligand.objects.filter(
                        name=r['exp_mu_ligand_ref'], canonical=True
                ).exists(
                ):  #if this name is canonical and it has a ligand record already
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                               canonical=True)
                elif Ligand.objects.filter(
                        name=r['exp_mu_ligand_ref'],
                        canonical=False,
                        ambigious_alias=False
                ).exists(
                ):  #if this matches an alias that only has "one" parent canonical name - eg distinct
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                               canonical=False,
                                               ambigious_alias=False)
                elif Ligand.objects.filter(
                        name=r['exp_mu_ligand_ref'],
                        canonical=False,
                        ambigious_alias=True
                ).exists(
                ):  #if this matches an alias that only has several canonical parents, must investigate, start with empty.
                    lp = LigandProperities()
                    lp.save()
                    l_ref = Ligand()
                    l_ref.properities = lp
                    l_ref.name = r['exp_mu_ligand_ref']
                    l_ref.canonical = False
                    l_ref.ambigious_alias = True
                    l_ref.save()
                    l_ref.load_by_name(r['exp_mu_ligand_ref'])
                    l_ref.save()
                elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'],
                                           canonical=False).exists(
                                           ):  #amigious_alias not specified
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                               canonical=False)
                    l_ref.ambigious_alias = False
                    l_ref.save()
                elif r['exp_mu_ligand_ref']:  #if neither a canonical or alias exists, create the records. Remember to check for canonical / alias status.
                    lp = LigandProperities()
                    lp.save()
                    l_ref = Ligand()
                    l_ref.properities = lp
                    l_ref.name = r['exp_mu_ligand_ref']
                    l_ref.canonical = True
                    l_ref.ambigious_alias = False
                    l_ref.save()
                    l_ref.load_by_name(r['exp_mu_ligand_ref'])
                    try:
                        l_ref.save()
                    except IntegrityError:
                        l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                                   canonical=False)
                        print("error failing ligand, duplicate?")
                        # logger.error("FAILED SAVING LIGAND, duplicate?")
                else:
                    l_ref = None
                self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])] = l_ref

            protein_id = 0
            residue_id = 0

            protein = Protein.objects.filter(entry_name=r['protein'])
            if protein.exists():
                protein = protein.get()
                if r['protein'] in mutants_for_proteins:
                    mutants_for_proteins[r['protein']] += 1
                else:
                    mutants_for_proteins[r['protein']] = 1

            else:
                skipped += 1
                if r['protein'] in missing_proteins:
                    missing_proteins[r['protein']] += 1
                else:
                    missing_proteins[r['protein']] = 1
                    self.logger.error('Skipped due to no protein ' +
                                      r['protein'])
                continue

            res = Residue.objects.filter(
                protein_conformation__protein=protein,
                amino_acid=r['mutation_from'],
                sequence_number=r['mutation_pos'])  #FIXME MAKE AA CHECK
            if res.exists():
                res = res.get()
            else:
                self.logger.error('Skipped due to no residue or mismatch AA ' +
                                  r['protein'] + ' pos:' +
                                  str(r['mutation_pos']) + ' AA:' +
                                  r['mutation_from'])
                skipped += 1
                continue

            if r['ligand_class']:
                l_role, created = LigandRole.objects.get_or_create(
                    name=r['ligand_class'],
                    defaults={'slug': slugify(r['ligand_class'])[:50]
                              })  # FIXME this should not be needed
            else:
                l_role = None

            if r['exp_type']:
                exp_type_id, created = MutationExperimentalType.objects.get_or_create(
                    type=r['exp_type'])
            else:
                exp_type_id = None

            if r['exp_func']:
                exp_func_id, created = MutationFunc.objects.get_or_create(
                    func=r['exp_func'])
            else:
                exp_func_id = None

            if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']:
                exp_qual_id, created = MutationQual.objects.get_or_create(
                    qual=r['exp_mu_effect_qual'],
                    prop=r['exp_mu_effect_ligand_prop'])
            else:
                exp_qual_id = None

            if r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r[
                    'opt_sign'] or r['opt_percentage'] or r['opt_qual'] or r[
                        'opt_agonist']:
                exp_opt_id, created = MutationOptional.objects.get_or_create(
                    type=r['opt_type'],
                    wt=r['opt_wt'],
                    mu=r['opt_mu'],
                    sign=r['opt_sign'],
                    percentage=r['opt_percentage'],
                    qual=r['opt_qual'],
                    agonist=r['opt_agonist'])
            else:
                exp_opt_id = None

            try:
                mutation, created = Mutation.objects.get_or_create(
                    amino_acid=r['mutation_to'], protein=protein, residue=res)
            except IntegrityError:
                mutation = Mutation.objects.get(amino_acid=r['mutation_to'],
                                                protein=protein,
                                                residue=res)
            logtypes = ['pEC50', 'pIC50', 'pK']

            foldchange = 0
            typefold = ''
            if r['exp_wt_value'] != 0 and r[
                    'exp_mu_value_raw'] != 0:  #fix for new format

                if re.match("(" + ")|(".join(logtypes) + ")",
                            r['exp_type']):  #-log values!
                    foldchange = round(
                        math.pow(10, -r['exp_mu_value_raw']) /
                        pow(10, -r['exp_wt_value']), 3)
                    typefold = r['exp_type'] + "_log"
                else:
                    foldchange = round(
                        r['exp_mu_value_raw'] / r['exp_wt_value'], 3)
                    typefold = r['exp_type'] + "_not_log"

                if foldchange < 1 and foldchange != 0:
                    foldchange = -round((1 / foldchange), 3)
            elif r['fold_effect'] != 0:
                foldchange = round(r['fold_effect'], 3)
                if foldchange < 1: foldchange = -round((1 / foldchange), 3)

            raw_experiment = self.insert_raw(r)
            bulk = MutationExperiment(
                refs=pub,
                review=pub_review,
                protein=protein,
                residue=res,
                ligand=l,
                ligand_role=l_role,
                ligand_ref=l_ref,
                #raw = raw_experiment, #raw_experiment, OR None
                optional=exp_opt_id,
                exp_type=exp_type_id,
                exp_func=exp_func_id,
                exp_qual=exp_qual_id,
                mutation=mutation,
                wt_value=r['exp_wt_value'],  #
                wt_unit=r['exp_wt_unit'],
                mu_value=r['exp_mu_value_raw'],
                mu_sign=r['exp_mu_effect_sign'],
                foldchange=foldchange)
            # mut_id = obj.id
            bulk_r.append(raw_experiment)
            bulk_m.append(bulk)
            inserted += 1
            end = time.time()
            diff = round(end - current, 2)
            #print(diff)

        self.logger.info('Parsed ' + str(c) +
                         ' mutant data entries. Skipped ' + str(skipped))

        current = time.time()

        raws = MutationRaw.objects.bulk_create(bulk_r)
        for i, me in enumerate(bulk_m):
            me.raw = raws[i]
        MutationExperiment.objects.bulk_create(bulk_m)
        end = time.time()
        diff = round(end - current, 2)
        # current_sheet
        diff_2 = round(end - current_sheet, 2)
        print("overall", diff_2, "bulk", diff, len(bulk_m), "skipped",
              str(skipped))
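
The fold-change logic above treats -log measurement types (pEC50, pIC50, pK...) specially: values are converted back to linear scale with 10^-x before the mutant/wild-type ratio is taken, and ratios below 1 are flipped to negative reciprocals so losses and gains are symmetric around ±1. A self-contained version of that rule:

# Self-contained version of the fold-change rule used above.
import math


def fold_change(wt_value, mu_value, is_log_type):
    """Mutant/wild-type fold change; ratios below 1 become negative reciprocals."""
    if is_log_type:
        # pEC50/pIC50/pK values are -log10 of a concentration
        fc = round(math.pow(10, -mu_value) / math.pow(10, -wt_value), 3)
    else:
        fc = round(mu_value / wt_value, 3)
    if 0 < fc < 1:
        fc = -round(1 / fc, 3)
    return fc


print(fold_change(7.0, 5.5, True))  # 31.623  (mutant ~32-fold less potent)
print(fold_change(5.5, 7.0, True))  # -31.25  (mutant more potent; note the early rounding)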
Example #6
    def main_func(self, positions, iteration, count, lock):
        # filenames
        # if not positions[1]:
        #     rows = self.data[positions[0]:]
        # else:
        #     rows = self.data[positions[0]:positions[1]]

        missing_proteins = {}
        mutants_for_proteins = {}
        wrong_uniport_ids = {}

        c = 0
        skipped = 0
        inserted = 0
        bulk_m = []
        bulk_r = []
        current_sheet = time.time()

        rows = self.data_all
        while count.value < len(rows):
            with lock:
                r = rows[count.value]
                count.value += 1
        # for r in rows:
        # print(r['source_file'],c)
        # PRINT IF ERRORS OCCUR
        #self.logger.info('File '+str(r['source_file'])+' number '+str(c))
            current = time.time()
            c += 1
            # if c%100==0:
            #     self.logger.info('Parsed '+str(c)+' mutant data entries')

            # publication
            try:  #fix if it thinks it's float.
                float(r['reference'])
                r['reference'] = str(int(r['reference']))
                float(r['review'])
                r['review'] = str(int(r['review']))
            except ValueError:
                pass

            if r['reference'].isdigit():  #assume pubmed
                pub_type = 'pubmed'
            else:  #assume doi
                pub_type = 'doi'

            if r['reference'] not in self.publication_cache:
                try:
                    wl = WebLink.objects.get(index=r['reference'],
                                             web_resource__slug=pub_type)
                except WebLink.DoesNotExist:
                    try:
                        wl = WebLink.objects.create(
                            index=r['reference'],
                            web_resource=WebResource.objects.get(
                                slug=pub_type))
                    except IntegrityError:
                        wl = WebLink.objects.get(index=r['reference'],
                                                 web_resource__slug=pub_type)

                try:
                    pub = Publication.objects.get(web_link=wl)
                except Publication.DoesNotExist:
                    pub = Publication()
                    try:
                        pub.web_link = wl
                        pub.save()
                    except IntegrityError:
                        pub = Publication.objects.get(web_link=wl)

                    if pub_type == 'doi':
                        pub.update_from_doi(doi=r['reference'])
                    elif pub_type == 'pubmed':
                        pub.update_from_pubmed_data(index=r['reference'])
                    try:
                        pub.save()
                    except:
                        self.logger.error('error with reference ' +
                                          str(r['reference']) + ' ' + pub_type)
                        continue  #if something off with publication, skip.
                self.publication_cache[r['reference']] = pub
            else:
                pub = self.publication_cache[r['reference']]

            # print(r['review'],r['reference'])
            if r['review'].isdigit():  #assume pubmed
                pub_type = 'pubmed'
            elif r['review'].startswith('http'):
                pub_type = 'raw_link'
            else:  #assume doi
                pub_type = 'doi'

            # print(r['review'],pub_type)
            if r['review']:
                if r['review'] not in self.publication_cache:
                    try:
                        wl = WebLink.objects.get(index=r['review'],
                                                 web_resource__slug=pub_type)
                    except WebLink.DoesNotExist:
                        try:
                            wl = WebLink.objects.create(
                                index=r['review'],
                                web_resource=WebResource.objects.get(
                                    slug=pub_type))
                        except IntegrityError:
                            wl = WebLink.objects.get(
                                index=r['review'], web_resource__slug=pub_type)

                    try:
                        pub_review = Publication.objects.get(web_link=wl)
                    except Publication.DoesNotExist:
                        pub_review = Publication()
                        try:
                            pub_review.web_link = wl
                            pub_review.save()
                        except IntegrityError:
                            pub_review = Publication.objects.get(web_link=wl)

                        if pub_type == 'doi':
                            pub_review.update_from_doi(doi=r['review'])
                        elif pub_type == 'pubmed':
                            pub_review.update_from_pubmed_data(
                                index=r['review'])
                        try:
                            pub_review.save()
                        except:
                            self.logger.error('error with review ' +
                                              str(r['review']) + ' ' +
                                              pub_type)
                            continue  #if something off with publication, skip.
                        self.publication_cache[r['review']] = pub_review
                else:
                    pub_review = self.publication_cache[r['review']]
            else:
                pub_review = None

            l = None
            if str(r['ligand_name']) in self.ligand_cache:
                if r['ligand_id'] in self.ligand_cache[str(r['ligand_name'])]:
                    l = self.ligand_cache[str(
                        r['ligand_name'])][r['ligand_id']]
            else:
                self.ligand_cache[str(r['ligand_name'])] = {}

            if not l:
                try:
                    l = get_or_make_ligand(r['ligand_id'], r['ligand_type'],
                                           str(r['ligand_name']))
                except Exception as msg:
                    print(
                        'Something errored with ligand, aborting entry of mutation',
                        r['ligand_name'], r['ligand_type'], r['ligand_id'],
                        r['source_file'])
                    print(msg)
                    traceback.print_exc()
                    continue
                self.ligand_cache[str(r['ligand_name'])][r['ligand_id']] = l

            l_ref = None
            if str(r['exp_mu_ligand_ref']) in self.ref_ligand_cache:
                l_ref = self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])]
            else:
                if Ligand.objects.filter(
                        name=r['exp_mu_ligand_ref'], canonical=True
                ).exists(
                ):  #if this name is canonical and it has a ligand record already
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                               canonical=True)
                elif Ligand.objects.filter(
                        name=r['exp_mu_ligand_ref'],
                        canonical=False,
                        ambigious_alias=False
                ).exists(
                ):  #if this matches an alias that only has "one" parent canonical name - eg distinct
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                               canonical=False,
                                               ambigious_alias=False)
                elif Ligand.objects.filter(
                        name=r['exp_mu_ligand_ref'],
                        canonical=False,
                        ambigious_alias=True
                ).exists(
                ):  #if this matches an alias that only has several canonical parents, must investigate, start with empty.
                    lp = LigandProperities()
                    lp.save()
                    l_ref = Ligand()
                    l_ref.properities = lp
                    l_ref.name = r['exp_mu_ligand_ref']
                    l_ref.canonical = False
                    l_ref.ambigious_alias = True
                    l_ref.save()
                    l_ref.load_by_name(r['exp_mu_ligand_ref'])
                    l_ref.save()
                elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'],
                                           canonical=False).exists(
                                           ):  #amigious_alias not specified
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                               canonical=False)
                    l_ref.ambigious_alias = False
                    l_ref.save()
                elif r['exp_mu_ligand_ref']:  #if neither a canonical or alias exists, create the records. Remember to check for canonical / alias status.
                    lp = LigandProperities()
                    lp.save()
                    l_ref = Ligand()
                    l_ref.properities = lp
                    l_ref.name = r['exp_mu_ligand_ref']
                    l_ref.canonical = True
                    l_ref.ambigious_alias = False
                    try:
                        l_ref.save()
                        l_ref.load_by_name(r['exp_mu_ligand_ref'])
                    except IntegrityError:
                        if Ligand.objects.filter(name=r['exp_mu_ligand_ref'],
                                                 canonical=True).exists():
                            l_ref = Ligand.objects.get(
                                name=r['exp_mu_ligand_ref'], canonical=True)
                        else:
                            l_ref = Ligand.objects.get(
                                name=r['exp_mu_ligand_ref'], canonical=False)
                        # print("error failing ligand, duplicate?")
                    try:
                        l_ref.save()
                    except IntegrityError:
                        l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'],
                                                   canonical=False)
                        # print("error failing ligand, duplicate?")
                        # logger.error("FAILED SAVING LIGAND, duplicate?")
                else:
                    l_ref = None
                self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])] = l_ref

            protein_id = 0
            residue_id = 0

            protein = Protein.objects.filter(entry_name=r['protein'])
            if protein.exists():
                protein = protein.get()
                if r['protein'] in mutants_for_proteins:
                    mutants_for_proteins[r['protein']] += 1
                else:
                    mutants_for_proteins[r['protein']] = 1

            elif r['protein'] not in missing_proteins:

                try:
                    real_uniprot = wrong_uniport_ids[r['protein']]
                    r['protein'] = real_uniprot
                    protein = Protein.objects.get(entry_name=r['protein'])
                    # print('fetched with lookup table',r['protein'])
                except:
                    # look for it as uniprot
                    protein = Protein.objects.filter(
                        web_links__web_resource__slug='uniprot',
                        web_links__index=r['protein'].upper())
                    if protein.exists():
                        protein = protein.get()
                        real_uniprot = protein.entry_name
                        if r['protein'] in mutants_for_proteins:
                            mutants_for_proteins[r['protein']] += 1
                        else:
                            mutants_for_proteins[r['protein']] = 1
                    else:
                        # Try to lookup in uniprot to catch typing errors / variants in entry_name
                        url = 'http://www.uniprot.org/uniprot/$index.xml'
                        cache_dir = ['uniprot', 'id']
                        uniprot_protein = fetch_from_web_api(url,
                                                             r['protein'],
                                                             cache_dir,
                                                             xml=True)
                        try:
                            real_uniprot = uniprot_protein.find(
                                './/{http://uniprot.org/uniprot}name'
                            ).text.lower()
                            protein = Protein.objects.get(
                                entry_name=real_uniprot)
                        except:
                            skipped += 1
                            if r['protein'] in missing_proteins:
                                missing_proteins[r['protein']] += 1
                            else:
                                missing_proteins[r['protein']] = 1
                                # print('Skipped due to no protein '+ r['protein'])
                                self.logger.error(
                                    'Skipped due to no protein ' +
                                    r['protein'])
                            continue
                    wrong_uniport_ids[r['protein']] = protein.entry_name
                    r['protein'] = real_uniprot
            else:
                missing_proteins[r['protein']] += 1
                continue

            res = Residue.objects.filter(
                protein_conformation__protein=protein,
                amino_acid=r['mutation_from'],
                sequence_number=r['mutation_pos'])  #FIXME MAKE AA CHECK
            if res.exists():
                res = res.get()
            else:
                self.logger.error('Skipped due to no residue or mismatch AA ' +
                                  r['protein'] + ' pos:' +
                                  str(r['mutation_pos']) + ' AA:' +
                                  r['mutation_from'])
                # print('Skipped due to no residue or mismatch AA ' + r['protein'] + ' pos:'+str(r['mutation_pos']) + ' AA:'+r['mutation_from'],r['source_file'])
                skipped += 1
                continue

            if r['ligand_class']:
                try:
                    l_role, created = LigandRole.objects.get_or_create(
                        name=r['ligand_class'],
                        defaults={'slug': slugify(r['ligand_class'])[:50]
                                  })  # FIXME this should not be needed
                except Exception as e:
                    if LigandRole.objects.filter(
                            slug=slugify(r['ligand_class'])[:50]).exists():
                        l_role = LigandRole.objects.get(
                            slug=slugify(r['ligand_class'])[:50])
                        if l_role.name == slugify(r['ligand_class'])[:50]:
                            #if name of role is same as slug, then it was created by constructs script, replace it
                            l_role.name = r['ligand_class']
                            l_role.save()
                    else:
                        print(e)
                        print("Error with", r['ligand_class'],
                              slugify(r['ligand_class'])[:50])
                        l_role, created = LigandRole.objects.get_or_create(
                            slug=slugify(r['ligand_class'])
                            [:50])  # FIXME this should not be needed
            else:
                l_role = None

            if r['exp_type']:
                exp_type_id, created = MutationExperimentalType.objects.get_or_create(
                    type=r['exp_type'])
            else:
                exp_type_id = None

            if r['exp_func']:
                exp_func_id, created = MutationFunc.objects.get_or_create(
                    func=r['exp_func'])
            else:
                exp_func_id = None

            if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']:
                exp_qual_id, created = MutationQual.objects.get_or_create(
                    qual=r['exp_mu_effect_qual'],
                    prop=r['exp_mu_effect_ligand_prop'])
            else:
                exp_qual_id = None

            # if r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r['opt_sign'] or r['opt_percentage'] or r['opt_qual'] or r['opt_agonist']:
            #     exp_opt_id, created =  MutationOptional.objects.get_or_create(type=r['opt_type'], wt=r['opt_wt'], mu=r['opt_mu'], sign=r['opt_sign'], percentage=r['opt_percentage'], qual=r['opt_qual'], agonist=r['opt_agonist'])
            # else:
            #     exp_opt_id = None

            try:
                mutation, created = Mutation.objects.get_or_create(
                    amino_acid=r['mutation_to'], protein=protein, residue=res)
            except IntegrityError:
                mutation = Mutation.objects.get(amino_acid=r['mutation_to'],
                                                protein=protein,
                                                residue=res)
            logtypes = ['pEC50', 'pIC50', 'pK']

            foldchange = 0
            typefold = ''
            if r['exp_wt_value'] != 0 and r[
                    'exp_mu_value_raw'] != 0:  #fix for new format
                if re.match("(" + ")|(".join(logtypes) + ")",
                            r['exp_type']):  #-log values!
                    try:
                        foldchange = round(
                            math.pow(10, -r['exp_mu_value_raw']) /
                            pow(10, -r['exp_wt_value']), 3)
                    except:
                        print(r)
                    typefold = r['exp_type'] + "_log"
                elif "%" == r['exp_wt_unit']:
                    # if % then it's a difference case, then lower value is bad. Otherwise it's conc and lower is better
                    foldchange = round(
                        r['exp_wt_value'] / r['exp_mu_value_raw'], 3)
                else:
                    foldchange = round(
                        r['exp_mu_value_raw'] / r['exp_wt_value'], 3)
                    typefold = r['exp_type'] + "_not_log"
                if foldchange > 0 and foldchange < 1 and foldchange != 0:
                    foldchange = -round((1 / foldchange), 3)
            elif r['fold_effect'] != 0:
                foldchange = round(r['fold_effect'], 3)
                if foldchange < 1: foldchange = -round((1 / foldchange), 3)
            r['fold_effect'] = foldchange

            raw_experiment = self.insert_raw(r)
            # raw_experiment.save()
            bulk = MutationExperiment(
                refs=pub,
                review=pub_review,
                submitting_group=r['submitting_group'],
                data_container=r['data_container'],
                data_container_number=r['data_container_number'],
                protein=protein,
                residue=res,
                ligand=l,
                ligand_role=l_role,
                ligand_ref=l_ref,
                # raw = raw_experiment, #raw_experiment, OR None
                # optional = exp_opt_id,
                exp_type=exp_type_id,
                exp_func=exp_func_id,
                exp_qual=exp_qual_id,
                mutation=mutation,
                wt_value=r['exp_wt_value'],  #
                wt_unit=r['exp_wt_unit'],
                mu_value=r['exp_mu_value_raw'],
                mu_sign=r['exp_mu_effect_sign'],
                foldchange=foldchange,
                opt_receptor_expression=r['opt_receptor_expression'],
                opt_basal_activity=r['opt_basal_activity'],
                opt_gain_of_activity=r['opt_gain_of_activity'],
                opt_ligand_emax=r['opt_ligand_emax'],
                opt_agonist=r['opt_agonist'],
            )
            # for line,val in r.items():
            #     val = str(val)
            #     if len(val)>100:
            #         print(line,"too long",val)
            # mut_id = obj.id
            bulk_r.append(raw_experiment)
            bulk_m.append(bulk)
            # try:
            #     bulk.save()
            # except Exception as e:
            #     print(e)
            #     print(r)
            #     break
            #print('saved ',r['source_file'])
            inserted += 1
            end = time.time()
            diff = round(end - current, 2)
            #print(diff)

        self.logger.info('Parsed ' + str(c) +
                         ' mutant data entries. Skipped ' + str(skipped))

        current = time.time()

        raws = MutationRaw.objects.bulk_create(bulk_r)
        for i, me in enumerate(bulk_m):
            me.raw = raws[i]
        MutationExperiment.objects.bulk_create(bulk_m)
        end = time.time()
        diff = round(end - current, 2)
        # current_sheet
        diff_2 = round(end - current_sheet, 2)
        print("overall", diff_2, "bulk", diff, len(bulk_m), "skipped",
              str(skipped))
        sorted_missing_proteins = sorted(missing_proteins.items(),
                                         key=operator.itemgetter(1),
                                         reverse=True)
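
When the protein entry name is not found, the loader above falls back to treating the value as a UniProt accession and finally queries the UniProt web service (through the project's fetch_from_web_api helper and the older http://www.uniprot.org/uniprot/$index.xml endpoint) to recover the real entry name. A standalone sketch of that last step using only the standard library; the endpoint and accession here are illustrative and network access is required:

# Standalone sketch of the UniProt entry-name fallback used above.
# Endpoint and accession are illustrative; requires network access.
import urllib.request
import xml.etree.ElementTree as ET

UNIPROT_NS = '{http://uniprot.org/uniprot}'


def uniprot_entry_name(accession):
    """Resolve a UniProt accession (e.g. 'P08908') to a lower-case entry name."""
    url = 'https://www.uniprot.org/uniprot/{}.xml'.format(accession)
    with urllib.request.urlopen(url) as response:
        root = ET.parse(response).getroot()
    return root.find('.//' + UNIPROT_NS + 'name').text.lower()


# print(uniprot_entry_name('P08908'))  # expected to print something like '5ht1a_human'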
Example #7
    def create_mutant_data(self, filenames):
        self.logger.info('CREATING MUTANT DATA')
        
        # what files should be parsed?
        if not filenames:
            filenames = os.listdir(self.structure_data_dir)

        missing_proteins = {}
        mutants_for_proteins = {}

        for source_file in filenames:
            source_file_path = os.sep.join([self.structure_data_dir, source_file])
            if os.path.isfile(source_file_path) and source_file[0] != '.':
                self.logger.info('Reading file {}'.format(source_file_path))
                # read the yaml file

                if source_file[-4:]=='xlsx' or source_file[-3:]=='xls':
                    rows = self.loaddatafromexcel(source_file_path)
                    rows = self.analyse_rows(rows)
                elif source_file[-4:]=='yaml':
                    rows = yaml.load(open(source_file_path, 'r'))
                    temp = []
                    for r in rows:
                        d = {}
                        d['reference'] = r['pubmed']
                        d['protein'] = r['entry_name'].replace("__","_").lower()
                        d['mutation_pos'] = r['seq']
                        d['mutation_from'] = r['from_res']
                        d['mutation_to'] = r['to_res']
                        d['ligand_name'] = ''
                        d['ligand_type'] = ''
                        d['ligand_id'] = ''
                        d['ligand_class'] = ''
                        d['exp_type'] = ''
                        d['exp_func'] = ''
                        d['exp_wt_value'] = 0
                        d['exp_wt_unit'] = ''
                        d['exp_mu_effect_sign'] = ''
                        d['exp_mu_value_raw'] = 0
                        d['fold_effect'] = 0
                        d['exp_mu_effect_qual'] = ''
                        d['exp_mu_effect_ligand_prop'] = ''
                        d['exp_mu_ligand_ref'] = ''
                        d['opt_type'] = ''
                        d['opt_wt'] = 0
                        d['opt_mu'] = 0
                        d['opt_sign'] = ''
                        d['opt_percentage'] = 0
                        d['opt_qual'] = ''
                        d['opt_agonist'] = ''
                        if len(d['mutation_to'])>1 or len(d['mutation_from'])>1: #if something is off with amino acid
                            continue
                        temp.append(d)
                    rows = temp
                else:
                    self.logger.info('unknown format {}'.format(source_file))
                    continue

                c = 0
                skipped = 0
                inserted = 0
                for r in rows:
                    c += 1
                    if c%1000==0: 
                        self.logger.info('Parsed '+str(c)+' mutant data entries')

                    # publication
                    try: #fix if it thinks it's float.
                        float(r['reference'])
                        r['reference'] = str(int(r['reference']))
                    except ValueError:
                        pass

                    if r['reference'].isdigit(): #assume pubmed
                        pub_type = 'pubmed'
                    else: #assume doi
                        pub_type = 'doi'

                    try:
                        pub = Publication.objects.get(web_link__index=r['reference'], web_link__web_resource__slug=pub_type)
                    except Publication.DoesNotExist:
                        pub = Publication()
                        try:
                            pub.web_link = WebLink.objects.get(index=r['reference'], web_resource__slug=pub_type)
                        except WebLink.DoesNotExist:
                            wl = WebLink.objects.create(index=r['reference'],
                                web_resource = WebResource.objects.get(slug=pub_type))
                            pub.web_link = wl

                        if pub_type == 'doi':
                            pub.update_from_doi(doi=r['reference'])
                        elif pub_type == 'pubmed':
                            pub.update_from_pubmed_data(index=r['reference'])
                        try:
                            pub.save()
                        except:
                            self.logger.error('error with reference ' + str(r['reference']) + ' ' + pub_type)
                            continue #if something off with publication, skip.

                    if r['ligand_type']=='PubChem CID' or r['ligand_type']=='SMILES':
                        if r['ligand_type']=='PubChem CID':
                            pubchem_lookup_value = 'cid'
                        elif r['ligand_type']=='SMILES':
                            pubchem_lookup_value = 'smiles'

                        try:
                            web_resource = WebResource.objects.get(slug='pubchem')
                        except:
                            # abort if the PubChem resource is not found
                            raise Exception('PubChem resource not found, aborting!')

                        if 'ligand_name' in r and r['ligand_name']:
                            ligand_name = str(r['ligand_name'])
                        else:
                            ligand_name = False

                        try:
                            # if this name is canonical and it has a ligand record already
                            l = Ligand.objects.get(name=ligand_name, canonical=True,
                                properities__web_links__web_resource=web_resource,
                                properities__web_links__index=r['ligand_id'])
                        except Ligand.DoesNotExist:
                            try:
                                # if exists under different name
                                l_canonical = Ligand.objects.get(properities__web_links__web_resource=web_resource,
                                    properities__web_links__index=r['ligand_id'], canonical=True)
                                l, created = Ligand.objects.get_or_create(properities = l_canonical.properities,
                                    name = ligand_name, canonical = False)
                                if created:
                                    self.logger.info('Created ligand {}'.format(l.name))
                            except Ligand.DoesNotExist:
                                # fetch ligand from pubchem
                                default_ligand_type = 'Small molecule'
                                lt, created = LigandType.objects.get_or_create(slug=slugify(default_ligand_type),
                                    defaults={'name': default_ligand_type})
                                l = Ligand()
                                l = l.load_from_pubchem(pubchem_lookup_value, r['ligand_id'], lt, ligand_name)
                        
                    elif r['ligand_name']:
                        
                        # if this name is canonical and it has a ligand record already
                        if Ligand.objects.filter(name=r['ligand_name'], canonical=True).exists():
                            l = Ligand.objects.get(name=r['ligand_name'], canonical=True)
                        
                        # if this matches an alias that only has "one" parent canonical name - eg distinct
                        elif Ligand.objects.filter(name=r['ligand_name'], canonical=False,
                            ambigious_alias=False).exists():
                            l = Ligand.objects.get(name=r['ligand_name'], canonical=False, ambigious_alias=False)
                        
                        # if this matches an alias that only has several canonical parents, must investigate, start
                        # with empty.
                        elif Ligand.objects.filter(name=r['ligand_name'], canonical=False,
                            ambigious_alias=True).exists():
                            lp = LigandProperities()
                            lp.save()
                            l = Ligand()
                            l.properities = lp
                            l.name = r['ligand_name']
                            l.canonical = False
                            l.ambigious_alias = True
                            l.save()
                            l.load_by_name(r['ligand_name'])
                        
                        # if neither a canonical or alias exists, create the records. Remember to check for
                        # canonical / alias status.
                        else:
                            lp = LigandProperities()
                            lp.save()
                            l = Ligand()
                            l.properities = lp
                            l.name = str(r['ligand_name'])
                            l.canonical = True
                            l.ambigious_alias = False
                            l.save()
                            l.load_by_name(str(r['ligand_name']))
                    else:
                        l = None

                    if Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=True).exists():
                        # this name is canonical and already has a ligand record
                        l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=True)
                    elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False,
                                               ambigious_alias=False).exists():
                        # matches an alias that has exactly one canonical parent name
                        l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False,
                                                   ambigious_alias=False)
                    elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False,
                                               ambigious_alias=True).exists():
                        # matches an alias with several canonical parents; must investigate, start with empty
                        lp = LigandProperities()
                        lp.save()
                        l_ref = Ligand()
                        l_ref.properities = lp
                        l_ref.name = r['exp_mu_ligand_ref']
                        l_ref.canonical = False
                        l_ref.ambigious_alias = True
                        l_ref.save()
                        l_ref.load_by_name(r['exp_mu_ligand_ref'])
                        l_ref.save()
                    elif r['exp_mu_ligand_ref']:
                        # neither a canonical nor an alias exists, create the records;
                        # remember to check for canonical/alias status
                        lp = LigandProperities()
                        lp.save()
                        l_ref = Ligand()
                        l_ref.properities = lp
                        l_ref.name = r['exp_mu_ligand_ref']
                        l_ref.canonical = True
                        l_ref.ambigious_alias = False
                        l_ref.save()
                        l_ref.load_by_name(r['exp_mu_ligand_ref'])
                        l_ref.save()
                    else:
                        l_ref = None

                    protein_id = 0
                    residue_id = 0

                    protein=Protein.objects.filter(entry_name=r['protein'])
                    if protein.exists():
                        protein=protein.get()
                        if r['protein'] in mutants_for_proteins:
                            mutants_for_proteins[r['protein']] += 1
                        else:
                            mutants_for_proteins[r['protein']] = 1

                    else:
                        skipped += 1
                        if r['protein'] in missing_proteins:
                            missing_proteins[r['protein']] += 1
                        else:
                            missing_proteins[r['protein']] = 1
                            self.logger.error('Skipped due to no protein '+ r['protein'])
                        continue

                    res=Residue.objects.filter(protein_conformation__protein=protein,sequence_number=r['mutation_pos'])
                    if res.exists():
                        res=res.get()
                    else:
                        self.logger.error('Skipped due to no residue ' + r['protein'] + ' pos:'+str(r['mutation_pos']))
                        skipped += 1
                        continue

                    if r['ligand_class']:
                        l_role, created = LigandRole.objects.get_or_create(name=r['ligand_class'],
                            defaults={'slug': slugify(r['ligand_class'])[:50]}) # FIXME this should not be needed
                    else:
                        l_role = None

                    if r['exp_type']:
                        exp_type_id, created = MutationExperimentalType.objects.get_or_create(type=r['exp_type'])
                    else:
                        exp_type_id = None

                    if r['exp_func']:
                        exp_func_id, created = MutationFunc.objects.get_or_create(func=r['exp_func'])
                    else:
                        exp_func_id = None

                    if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']:
                        exp_qual_id, created = MutationQual.objects.get_or_create(qual=r['exp_mu_effect_qual'], prop=r['exp_mu_effect_ligand_prop'])
                    else:
                        exp_qual_id = None

                    if (r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r['opt_sign']
                            or r['opt_percentage'] or r['opt_qual'] or r['opt_agonist']):
                        exp_opt_id, created = MutationOptional.objects.get_or_create(
                            type=r['opt_type'], wt=r['opt_wt'], mu=r['opt_mu'], sign=r['opt_sign'],
                            percentage=r['opt_percentage'], qual=r['opt_qual'], agonist=r['opt_agonist'])
                    else:
                        exp_opt_id = None

                    mutation, created =  Mutation.objects.get_or_create(amino_acid=r['mutation_to'],protein=protein, residue=res)

                    
                    logtypes = ['pEC50', 'pIC50', 'pK']

                    foldchange = 0
                    typefold = ''
                    if r['exp_wt_value'] != 0 and r['exp_mu_value_raw'] != 0:  # fix for new format
                        if re.match("(" + ")|(".join(logtypes) + ")", r['exp_type']):  # -log values
                            foldchange = round(math.pow(10, -r['exp_mu_value_raw']) / math.pow(10, -r['exp_wt_value']), 3)
                            typefold = r['exp_type'] + "_log"
                        else:
                            foldchange = round(r['exp_mu_value_raw'] / r['exp_wt_value'], 3)
                            typefold = r['exp_type'] + "_not_log"

                        # express decreases as negative folds (e.g. 0.5 -> -2.0)
                        if foldchange < 1 and foldchange != 0:
                            foldchange = -round((1 / foldchange), 3)
                    elif r['fold_effect'] != 0:
                        foldchange = round(r['fold_effect'], 3)
                        if foldchange < 1:
                            foldchange = -round((1 / foldchange), 3)
                    

                    raw_experiment = self.insert_raw(r)
                    obj, created = MutationExperiment.objects.get_or_create(
                        refs=pub,
                        protein=protein,
                        residue=res,
                        ligand=l,
                        ligand_role=l_role,
                        ligand_ref=l_ref,
                        raw=raw_experiment,
                        optional=exp_opt_id,
                        exp_type=exp_type_id,
                        exp_func=exp_func_id,
                        exp_qual=exp_qual_id,
                        mutation=mutation,
                        wt_value=r['exp_wt_value'],
                        wt_unit=r['exp_wt_unit'],
                        mu_value=r['exp_mu_value_raw'],
                        mu_sign=r['exp_mu_effect_sign'],
                        foldchange=foldchange
                    )
                    mut_id = obj.id
                    inserted += 1

                self.logger.info('Parsed '+str(c)+' mutant data entries. Skipped '+str(skipped))

        sorted_missing_proteins = sorted(missing_proteins.items(), key=operator.itemgetter(1),reverse=True)
        sorted_mutants_for_proteins = sorted(mutants_for_proteins.items(), key=operator.itemgetter(1),reverse=True)

        self.logger.info('COMPLETED CREATING MUTANTS')
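# A minimal standalone sketch of the fold-change convention used above: -log-scale
# measurements (pEC50, pIC50, pK) are converted back to linear scale before dividing
# mutant by wild type, and decreases are reported as negative folds. compute_fold_change
# is a hypothetical helper name, not part of the original code.
import math
import re

LOG_TYPES = ['pEC50', 'pIC50', 'pK']

def compute_fold_change(exp_type, wt_value, mu_value_raw, fold_effect=0):
    """Return (foldchange, typefold) following the convention in create_mutant_data."""
    foldchange = 0
    typefold = ''
    if wt_value != 0 and mu_value_raw != 0:
        if re.match("(" + ")|(".join(LOG_TYPES) + ")", exp_type):
            # -log values: convert back to linear scale before taking the ratio
            foldchange = round(math.pow(10, -mu_value_raw) / math.pow(10, -wt_value), 3)
            typefold = exp_type + "_log"
        else:
            foldchange = round(mu_value_raw / wt_value, 3)
            typefold = exp_type + "_not_log"
        if foldchange < 1 and foldchange != 0:
            # e.g. a ratio of 0.5 becomes -2.0
            foldchange = -round(1 / foldchange, 3)
    elif fold_effect != 0:
        foldchange = round(fold_effect, 3)
        if foldchange < 1:
            foldchange = -round(1 / foldchange, 3)
    return foldchange, typefold

# Example: a pEC50 of 9.0 (wild type) dropping to 7.0 (mutant) corresponds to a
# 100-fold higher EC50 for the mutant:
# compute_fold_change('pEC50', 9.0, 7.0) -> (100.0, 'pEC50_log')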
 def build_g_prot_struct(self, alpha_prot, pdb, data):
     ss = SignprotStructure()
     pdb_code, p_c = WebLink.objects.get_or_create(
         index=pdb, web_resource=WebResource.objects.get(slug='pdb'))
     pub_date = data['release_date']
     # Structure type
     if 'x-ray' in data['method'].lower():
         structure_type_slug = 'x-ray-diffraction'
     elif 'electron' in data['method'].lower():
         structure_type_slug = 'electron-microscopy'
     else:
         structure_type_slug = '-'.join(data['method'].lower().split(' '))
     try:
         structure_type = StructureType.objects.get(
             slug=structure_type_slug)
     except StructureType.DoesNotExist as e:
         structure_type, c = StructureType.objects.get_or_create(
             slug=structure_type_slug, name=data['method'])
         self.logger.info('Created StructureType:' + str(structure_type))
     # Publication
     if data['doi']:
         try:
             pub = Publication.objects.get(web_link__index=data['doi'])
         except Publication.DoesNotExist as e:
             pub = Publication()
             wl, created = WebLink.objects.get_or_create(
                 index=data['doi'],
                 web_resource=WebResource.objects.get(slug='doi'))
             pub.web_link = wl
             pub.update_from_doi(doi=data['doi'])
             pub.save()
             self.logger.info('Created Publication:' + str(pub))
     else:
         if data['pubmedId']:
             try:
                 pub = Publication.objects.get(
                     web_link__index=data['pubmedId'])
             except Publication.DoesNotExist as e:
                 pub = Publication()
                 wl, created = WebLink.objects.get_or_create(
                     index=data['pubmedId'],
                     web_resource=WebResource.objects.get(slug='pubmed'))
                 pub.web_link = wl
                 pub.update_from_pubmed_data(index=data['pubmedId'])
                 pub.save()
                 self.logger.info('Created Publication:' + str(pub))
         else:
             pub = None
     ss.pdb_code = pdb_code
     ss.structure_type = structure_type
     ss.resolution = data['resolution']
     ss.publication_date = pub_date
     ss.publication = pub
     ss.protein = alpha_prot
     ss.save()
     # Stabilizing agent
     for o in data['other']:
         if len(o) > 75:
             continue
         if o == 'REGULATOR OF G-PROTEIN SIGNALING 14':
             o = 'Regulator of G-protein signaling 14'
         elif o == 'Nanobody 35':
             o = 'Nanobody-35'
         elif o == 'ADENYLATE CYCLASE, TYPE V':
             o = 'Adenylate cyclase, type V'
         elif o == '1-phosphatidylinositol-4,5-bisphosphate phosphodiesterase beta-3':
             o = '1-phosphatidylinositol 4,5-bisphosphate phosphodiesterase beta-3'
         stabagent, sa_created = StructureStabilizingAgent.objects.get_or_create(
             slug=o.replace(' ', '-').replace(' ', '-'), name=o)
         ss.stabilizing_agents.add(stabagent)
     ss.save()
     # Extra proteins
     # Alpha - ### A bit redundant, consider changing this in the future
     if data['alpha']:
         alpha_sep = SignprotStructureExtraProteins()
         alpha_sep.wt_protein = alpha_prot
         alpha_sep.structure = ss
         alpha_sep.protein_conformation = ProteinConformation.objects.get(
             protein=alpha_prot)
         alpha_sep.display_name = self.display_name_lookup[
             alpha_prot.family.name]
         alpha_sep.note = None
         alpha_sep.chain = data['alpha_chain']
         alpha_sep.category = 'G alpha'
         cov = round(data['alpha_coverage'] / len(alpha_prot.sequence) *
                     100)
         if cov > 100:
             self.logger.warning(
                 "SignprotStructureExtraProtein Alpha subunit sequence coverage of {} is {}% which is longer than 100% in structure {}"
                 .format(alpha_sep, cov, ss))
             cov = 100
         alpha_sep.wt_coverage = cov
         alpha_sep.save()
         # ss.extra_proteins.add(alpha_sep)
     # Beta
     if data['beta']:
         beta_prot = Protein.objects.get(accession=data['beta'])
         beta_sep = SignprotStructureExtraProteins()
         beta_sep.wt_protein = beta_prot
         beta_sep.structure = ss
         beta_sep.protein_conformation = ProteinConformation.objects.get(
             protein=beta_prot)
         beta_sep.display_name = self.display_name_lookup[beta_prot.name]
         beta_sep.note = None
         beta_sep.chain = data['beta_chain']
         beta_sep.category = 'G beta'
         beta_sep.wt_coverage = None
         beta_sep.save()
         # ss.extra_proteins.add(beta_sep)
     # Gamma
     if data['gamma']:
         gamma_prot = Protein.objects.get(accession=data['gamma'])
         gamma_sep = SignprotStructureExtraProteins()
         gamma_sep.wt_protein = gamma_prot
         gamma_sep.structure = ss
         gamma_sep.protein_conformation = ProteinConformation.objects.get(
             protein=gamma_prot)
         gamma_sep.display_name = self.display_name_lookup[gamma_prot.name]
         gamma_sep.note = None
         gamma_sep.chain = data['gamma_chain']
         gamma_sep.category = 'G gamma'
         gamma_sep.wt_coverage = None
         gamma_sep.save()
         # ss.extra_proteins.add(gamma_sep)
     # ss.save()
     self.logger.info('Created SignprotStructure: {}'.format(ss.pdb_code))
 def build_g_prot_struct(self, alpha_prot, pdb, data):
     ss = SignprotStructure()
     pdb_code, p_c = WebLink.objects.get_or_create(
         index=pdb, web_resource=WebResource.objects.get(slug="pdb"))
     pub_date = data["release_date"]
     # Structure type
     if "x-ray" in data["method"].lower():
         structure_type_slug = "x-ray-diffraction"
     elif "electron" in data["method"].lower():
         structure_type_slug = "electron-microscopy"
     else:
         structure_type_slug = "-".join(data["method"].lower().split(" "))
     try:
         structure_type = StructureType.objects.get(
             slug=structure_type_slug)
     except StructureType.DoesNotExist as e:
         structure_type, c = StructureType.objects.get_or_create(
             slug=structure_type_slug, name=data["method"])
         self.logger.info("Created StructureType:" + str(structure_type))
     # Publication
     if data["doi"]:
         try:
             pub = Publication.objects.get(web_link__index=data["doi"])
         except Publication.DoesNotExist as e:
             pub = Publication()
             wl, created = WebLink.objects.get_or_create(
                 index=data["doi"],
                 web_resource=WebResource.objects.get(slug="doi"))
             pub.web_link = wl
             pub.update_from_doi(doi=data["doi"])
             pub.save()
             self.logger.info("Created Publication:" + str(pub))
     else:
         if data["pubmedId"]:
             try:
                 pub = Publication.objects.get(
                     web_link__index=data["pubmedId"])
             except Publication.DoesNotExist as e:
                 pub = Publication()
                 wl, created = WebLink.objects.get_or_create(
                     index=data["pubmedId"],
                     web_resource=WebResource.objects.get(slug="pubmed"))
                 pub.web_link = wl
                 pub.update_from_pubmed_data(index=data["pubmedId"])
                 pub.save()
                 self.logger.info("Created Publication:" + str(pub))
         else:
             pub = None
     # PDB data
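     # Download the entry text from RCSB; get_or_create on PdbData below reuses an
     # existing row if an identical PDB file has already been stored.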
     url = 'https://www.rcsb.org/pdb/files/{}.pdb'.format(pdb)
     req = urllib.request.Request(url)
     with urllib.request.urlopen(req) as response:
         pdbdata_raw = response.read().decode('utf-8')
     pdbdata_object = PdbData.objects.get_or_create(pdb=pdbdata_raw)[0]
     ss.pdb_code = pdb_code
     ss.structure_type = structure_type
     ss.resolution = data["resolution"]
     ss.publication_date = pub_date
     ss.publication = pub
     ss.protein = alpha_prot
     ss.pdb_data = pdbdata_object
     ss.save()
     # Stabilizing agent
     for o in data["other"]:
         if len(o) > 75:
             continue
         if o == "REGULATOR OF G-PROTEIN SIGNALING 14":
             o = "Regulator of G-protein signaling 14"
         elif o == "Nanobody 35":
             o = "Nanobody-35"
         elif o == "ADENYLATE CYCLASE, TYPE V":
             o = "Adenylate cyclase, type V"
         elif o == "1-phosphatidylinositol-4,5-bisphosphate phosphodiesterase beta-3":
             o = "1-phosphatidylinositol 4,5-bisphosphate phosphodiesterase beta-3"
         stabagent, sa_created = StructureStabilizingAgent.objects.get_or_create(
             slug=o.replace(" ", "-").replace(" ", "-"), name=o)
         ss.stabilizing_agents.add(stabagent)
     ss.save()
     # Extra proteins
     # Alpha - ### A bit redundant, consider changing this in the future
     if data["alpha"]:
         alpha_sep = SignprotStructureExtraProteins()
         alpha_sep.wt_protein = alpha_prot
         alpha_sep.structure = ss
         alpha_sep.protein_conformation = ProteinConformation.objects.get(
             protein=alpha_prot)
         alpha_sep.display_name = self.display_name_lookup[
             alpha_prot.family.name]
         alpha_sep.note = None
         alpha_sep.chain = data["alpha_chain"]
         alpha_sep.category = "G alpha"
         cov = round(data["alpha_coverage"] / len(alpha_prot.sequence) *
                     100)
         if cov > 100:
             self.logger.warning(
                 "SignprotStructureExtraProtein Alpha subunit sequence coverage of {} is {}% which is longer than 100% in structure {}"
                 .format(alpha_sep, cov, ss))
             cov = 100
         alpha_sep.wt_coverage = cov
         alpha_sep.save()
         # ss.extra_proteins.add(alpha_sep)
     # Beta
     if data["beta"]:
         beta_prot = Protein.objects.get(accession=data["beta"])
         beta_sep = SignprotStructureExtraProteins()
         beta_sep.wt_protein = beta_prot
         beta_sep.structure = ss
         beta_sep.protein_conformation = ProteinConformation.objects.get(
             protein=beta_prot)
         beta_sep.display_name = self.display_name_lookup[beta_prot.name]
         beta_sep.note = None
         beta_sep.chain = data["beta_chain"]
         beta_sep.category = "G beta"
         beta_sep.wt_coverage = None
         beta_sep.save()
         # ss.extra_proteins.add(beta_sep)
     # Gamma
     if data["gamma"]:
         gamma_prot = Protein.objects.get(accession=data["gamma"])
         gamma_sep = SignprotStructureExtraProteins()
         gamma_sep.wt_protein = gamma_prot
         gamma_sep.structure = ss
         gamma_sep.protein_conformation = ProteinConformation.objects.get(
             protein=gamma_prot)
         gamma_sep.display_name = self.display_name_lookup[gamma_prot.name]
         gamma_sep.note = None
         gamma_sep.chain = data["gamma_chain"]
         gamma_sep.category = "G gamma"
         gamma_sep.wt_coverage = None
         gamma_sep.save()
         # ss.extra_proteins.add(gamma_sep)
     # ss.save()
     self.logger.info("Created SignprotStructure: {}".format(ss.pdb_code))
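# A small, hypothetical pure-function extraction of the slug mapping both versions of
# build_g_prot_struct use for the StructureType lookup (the name structure_method_slug
# is not part of the original code).
def structure_method_slug(method):
    """Map an experimental method string to the slug convention used above."""
    m = method.lower()
    if 'x-ray' in m:
        return 'x-ray-diffraction'
    if 'electron' in m:
        return 'electron-microscopy'
    return '-'.join(m.split(' '))

# structure_method_slug('X-RAY DIFFRACTION')  -> 'x-ray-diffraction'
# structure_method_slug('SOLUTION NMR')       -> 'solution-nmr'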
Example #10
0
    def main_func(self, positions, iteration):
        # filenames
        if not positions[1]:
            filenames = self.filenames[positions[0]:]
        else:
            filenames = self.filenames[positions[0]:positions[1]]

        for source_file in filenames:
            source_file_path = os.sep.join([self.structure_data_dir, source_file])
            if os.path.isfile(source_file_path) and source_file[0] != '.':
                self.logger.info('Reading file {}'.format(source_file_path))
                # read the yaml file
                with open(source_file_path, 'r') as f:
                    sd = yaml.load(f)
                    
                    # is this a representative structure (will be used to guide structure-based alignments)?
                    representative = False
                    if 'representative' in sd and sd['representative']:
                        representative = True

                    # only process representative structures on first iteration
                    if not representative and iteration == 1:
                        continue

                    # skip representative structures on second iteration
                    if representative and iteration == 2:
                        continue

                    # is there a construct?
                    if 'construct' not in sd:
                        self.logger.error('No construct specified, skipping!')
                        continue

                    # does the construct exists?
                    try:
                        con = Protein.objects.get(entry_name=sd['construct'])
                    except Protein.DoesNotExist:
                        self.logger.error('Construct {} does not exists, skipping!'.format(sd['construct']))
                        continue

                    # create a structure record
                    try:
                        s = Structure.objects.get(protein_conformation__protein=con)
                    except Structure.DoesNotExist:
                        s = Structure()
                        s.representative = representative

                    # protein state
                    if 'state' not in sd:
                        self.logger.warning('State not defined, using default state {}'.format(
                            settings.DEFAULT_PROTEIN_STATE))
                        state = settings.DEFAULT_PROTEIN_STATE.title()
                    else:
                        state = sd['state']
                    state_slug = slugify(state)
                    try:
                        ps, created = ProteinState.objects.get_or_create(slug=state_slug, defaults={'name': state})
                        if created:
                            self.logger.info('Created protein state {}'.format(ps.name))
                    except IntegrityError:
                        ps = ProteinState.objects.get(slug=state_slug)
                    s.state = ps

                    # protein conformation
                    try:
                        s.protein_conformation = ProteinConformation.objects.get(protein=con)
                    except ProteinConformation.DoesNotExist:
                        self.logger.error('Protein conformation for construct {} does not exists'.format(con))
                        continue
                    if s.protein_conformation.state != ps:
                        ProteinConformation.objects.filter(protein=con).update(state=ps)

                    # get the PDB file and save to DB
                    sd['pdb'] = sd['pdb'].upper()
                    if not os.path.exists(self.pdb_data_dir):
                        os.makedirs(self.pdb_data_dir)
                    
                    pdb_path = os.sep.join([self.pdb_data_dir, sd['pdb'] + '.pdb'])
                    if not os.path.isfile(pdb_path):
                        self.logger.info('Fetching PDB file {}'.format(sd['pdb']))
                        url = 'http://www.rcsb.org/pdb/files/%s.pdb' % sd['pdb']
                        pdbdata_raw = urlopen(url).read().decode('utf-8')
                        with open(pdb_path, 'w') as f:
                            f.write(pdbdata_raw)
                    else:
                        with open(pdb_path, 'r') as pdb_file:
                            pdbdata_raw = pdb_file.read()
                    
                    pdbdata, created = PdbData.objects.get_or_create(pdb=pdbdata_raw)
                    s.pdb_data = pdbdata

                    # UPDATE HETSYN with its PDB reference instead + GRAB PUB DATE, PMID, DOI AND RESOLUTION
                    hetsyn = {}
                    hetsyn_reverse = {}
                    for line in pdbdata_raw.splitlines():
                        if line.startswith('HETSYN'):
                            # need to fix bad PDB formatting where col4 and col5 are put
                            # together for some reason -- usually seen when the id is +1000
                            m = re.match("HETSYN[\s]+([\w]{3})[\s]+(.+)", line)
                            if (m):
                                hetsyn[m.group(2).strip()] = m.group(1).upper()
                                hetsyn_reverse[m.group(1)] = m.group(2).strip().upper()
                        if line.startswith('HETNAM'):
                            m = re.match("HETNAM[\s]+([\w]{3})[\s]+(.+)", line)
                            if (m):
                                hetsyn[m.group(2).strip()] = m.group(1).upper()
                                hetsyn_reverse[m.group(1)] = m.group(2).strip().upper()
                        if line.startswith('REVDAT   1'):
                            sd['publication_date'] = line[13:22]
                        if line.startswith('JRNL        PMID'):
                            sd['pubmed_id'] = line[19:].strip()
                        if line.startswith('JRNL        DOI'):
                            sd['doi_id'] = line[19:].strip()

                    if len(hetsyn) == 0:
                        self.logger.info("PDB file contained NO hetsyn")

                    with open(pdb_path,'r') as header:
                        header_dict = parse_pdb_header(header)
                    sd['publication_date'] = header_dict['release_date']
                    sd['resolution'] = str(header_dict['resolution']).strip()
                    sd['structure_method'] = header_dict['structure_method']

                    # structure type
                    if 'structure_method' in sd and sd['structure_method']:
                        structure_type = sd['structure_method'].capitalize()
                        structure_type_slug = slugify(sd['structure_method'])
                        
                        try:
                            st, created = StructureType.objects.get_or_create(slug=structure_type_slug,
                                defaults={'name': structure_type})
                            if created:
                                self.logger.info('Created structure type {}'.format(st))
                        except IntegrityError:
                            st = StructureType.objects.get(slug=structure_type_slug)
                        s.structure_type = st
                    else:
                        self.logger.warning('No structure type specified in PDB file {}'.format(sd['pdb']))

                    matched = 0
                    if 'ligand' in sd and sd['ligand']:
                        if isinstance(sd['ligand'], list):
                            ligands = sd['ligand']
                        else:
                            ligands = [sd['ligand']]
                        for ligand in ligands:
                            if 'name' in ligand:
                                if ligand['name'].upper() in hetsyn:
                                    self.logger.info('Ligand {} matched to PDB records'.format(ligand['name']))
                                    matched = 1
                                    ligand['name'] = hetsyn[ligand['name'].upper()]
                                elif ligand['name'].upper() in hetsyn_reverse:
                                    matched = 1

                    if matched==0 and len(hetsyn)>0:
                        self.logger.info('No ligand names found in HET in structure {}'.format(sd['pdb']))

                    # REMOVE? can be used to dump structure files with updated ligands
                    # yaml.dump(sd, open(source_file_path, 'w'), indent=4)

                    # pdb code
                    if 'pdb' in sd:
                        try:
                            web_resource = WebResource.objects.get(slug='pdb')
                        except:
                            # abort if pdb resource is not found
                            raise Exception('PDB resource not found, aborting!')
                        s.pdb_code, created = WebLink.objects.get_or_create(index=sd['pdb'],
                            web_resource=web_resource)
                    else:
                        self.logger.error('PDB code not specified for structure {}, skipping!'.format(sd['pdb']))
                        continue

                    # insert into plain text fields
                    if 'preferred_chain' in sd:
                        s.preferred_chain = sd['preferred_chain']
                    else:
                        self.logger.warning('Preferred chain not specified for structure {}'.format(sd['pdb']))
                    if 'resolution' in sd:
                        s.resolution = float(sd['resolution'])
                    else:
                        self.logger.warning('Resolution not specified for structure {}'.format(sd['pdb']))
                    if 'publication_date' in sd:
                        s.publication_date = sd['publication_date']
                    else:
                        self.logger.warning('Publication date not specified for structure {}'.format(sd['pdb']))

                    # publication
                    try:                     
                        if 'doi_id' in sd:
                            try:
                                s.publication = Publication.objects.get(web_link__index=sd['doi_id'])
                            except Publication.DoesNotExist as e:
                                p = Publication()
                                try:
                                    p.web_link = WebLink.objects.get(index=sd['doi_id'], web_resource__slug='doi')
                                except WebLink.DoesNotExist:
                                    wl = WebLink.objects.create(index=sd['doi_id'],
                                        web_resource = WebResource.objects.get(slug='doi'))
                                    p.web_link = wl
                                p.update_from_doi(doi=sd['doi_id'])
                                p.save()
                                s.publication = p
                        elif 'pubmed_id' in sd:
                            try:
                                s.publication = Publication.objects.get(web_link__index=sd['pubmed_id'])
                            except Publication.DoesNotExist as e:
                                p = Publication()
                                try:
                                    p.web_link = WebLink.objects.get(index=sd['pubmed_id'],
                                        web_resource__slug='pubmed')
                                except WebLink.DoesNotExist:
                                    wl = WebLink.objects.create(index=sd['pubmed_id'],
                                        web_resource = WebResource.objects.get(slug='pubmed'))
                                    p.web_link = wl
                                p.update_from_pubmed_data(index=sd['pubmed_id'])
                                p.save()
                                s.publication = p
                    except:
                        self.logger.error('Error saving publication for structure {}'.format(sd['pdb']))

                    # save structure before adding M2M relations
                    s.save()

                    #Delete previous interaction data to prevent errors.
                    ResidueFragmentInteraction.objects.filter(structure_ligand_pair__structure=s).delete()
                    StructureLigandInteraction.objects.filter(structure=s).delete()
                    #Remove previous Rotamers/Residues to prepare repopulate
                    Fragment.objects.filter(structure=s).delete()
                    Rotamer.objects.filter(structure=s).all().delete()
                    Residue.objects.filter(protein_conformation=s.protein_conformation).all().delete()

                    # endogenous ligand(s)
                    default_ligand_type = 'Small molecule'
                    if representative and 'endogenous_ligand' in sd and sd['endogenous_ligand']:
                        if isinstance(sd['endogenous_ligand'], list):
                            endogenous_ligands = sd['endogenous_ligand']
                        else:
                            endogenous_ligands = [sd['endogenous_ligand']]
                        for endogenous_ligand in endogenous_ligands:
                            if endogenous_ligand['type']:
                                lt, created = LigandType.objects.get_or_create(slug=slugify(endogenous_ligand['type']),
                                    defaults={'name': endogenous_ligand['type']})
                            else:
                                lt, created = LigandType.objects.get_or_create(slug=slugify(default_ligand_type),
                                    defaults={'name': default_ligand_type})
                            ligand = Ligand()

                            if 'iupharId' not in endogenous_ligand:
                                endogenous_ligand['iupharId'] = 0

                            ligand = ligand.load_by_gtop_id(endogenous_ligand['name'], endogenous_ligand['iupharId'],
                                lt)
                            try:
                                s.protein_conformation.protein.parent.endogenous_ligands.add(ligand)
                            except IntegrityError:
                                self.logger.info('Endogenous ligand for protein {}, already added. Skipping.'.format(
                                    s.protein_conformation.protein.parent))

                    # ligands
                    peptide_chain = ''  # default when no peptide ligand chain is annotated
                    if 'ligand' in sd and sd['ligand']:
                        if isinstance(sd['ligand'], list):
                            ligands = sd['ligand']
                        else:
                            ligands = [sd['ligand']]
                        for ligand in ligands:
                            l = False
                            peptide_chain = ""
                            if 'chain' in ligand:
                                peptide_chain = ligand['chain']
                                ligand['name'] = 'pep'
                            if ligand['name'] and ligand['name'] != 'None': # some inserted as none.

                                # use annoted ligand type or default type
                                if ligand['type']:
                                    lt, created = LigandType.objects.get_or_create(slug=slugify(ligand['type']),
                                        defaults={'name': ligand['type']})
                                else:
                                    lt, created = LigandType.objects.get_or_create(
                                        slug=slugify(default_ligand_type), defaults={'name': default_ligand_type})

                                # set pdb reference for structure-ligand interaction
                                pdb_reference = ligand['name']

                                # use pubchem_id
                                if 'pubchemId' in ligand and ligand['pubchemId'] and ligand['pubchemId'] != 'None':
                                    # create ligand
                                    l = Ligand()


                                    # update ligand by pubchem id
                                    ligand_title = False
                                    if 'title' in ligand and ligand['title']:
                                        ligand_title = ligand['title']
                                    l = l.load_from_pubchem('cid', ligand['pubchemId'], lt, ligand_title)


                                # if no pubchem id is specified, use name
                                else:
                                    # use ligand title, if specified
                                    if 'title' in ligand and ligand['title']:
                                        ligand['name'] = ligand['title']

                                    # create empty properties
                                    lp = LigandProperities.objects.create()
                                    
                                    # create the ligand
                                    try:
                                        l, created = Ligand.objects.get_or_create(name=ligand['name'], canonical=True,
                                            defaults={'properities': lp, 'ambigious_alias': False})
                                        if created:
                                            self.logger.info('Created ligand {}'.format(ligand['name']))
                                        else:
                                            pass
                                    except IntegrityError:
                                        l = Ligand.objects.get(name=ligand['name'], canonical=True)

                                    # save ligand
                                    l.save()
                            else:
                                continue

                            # structure-ligand interaction
                            if l and ligand['role']:
                                role_slug = slugify(ligand['role'])
                                try:
                                    lr, created = LigandRole.objects.get_or_create(slug=role_slug,
                                    defaults={'name': ligand['role']})
                                    if created:
                                        self.logger.info('Created ligand role {}'.format(ligand['role']))
                                except IntegrityError:
                                    lr = LigandRole.objects.get(slug=role_slug)

                                i, created = StructureLigandInteraction.objects.get_or_create(structure=s,
                                    ligand=l, ligand_role=lr, annotated=True,
                                    defaults={'pdb_reference': pdb_reference})
                                if i.pdb_reference != pdb_reference:
                                    i.pdb_reference = pdb_reference
                                    i.save()


                    
                    # structure segments
                    if 'segments' in sd and sd['segments']:
                        for segment, positions in sd['segments'].items():
                            # fetch (create if needed) sequence segment
                            try:
                                protein_segment = ProteinSegment.objects.get(slug=segment)
                            except ProteinSegment.DoesNotExist:
                                self.logger.error('Segment {} not found'.format(segment))
                                continue

                            struct_seg, created = StructureSegment.objects.update_or_create(structure=s,
                                protein_segment=protein_segment, defaults={'start': positions[0], 'end': positions[1]})
                    # all representive structures should have defined segments
                    elif representative:
                        self.logger.warning('Segments not defined for representative structure {}'.format(sd['pdb']))

                    # structure segments for modeling
                    if 'segments_in_structure' in sd and sd['segments_in_structure']:
                        for segment, positions in sd['segments_in_structure'].items():
                            # fetch (create if needed) sequence segment
                            try:
                                protein_segment = ProteinSegment.objects.get(slug=segment)
                            except ProteinSegment.DoesNotExist:
                                self.logger.error('Segment {} not found'.format(segment))
                                continue

                            struct_seg_mod, created = StructureSegmentModeling.objects.update_or_create(structure=s,
                                protein_segment=protein_segment, defaults={'start': positions[0], 'end': positions[1]})

                    # structure coordinates
                    if 'coordinates' in sd and sd['coordinates']:
                        for segment, coordinates in sd['coordinates'].items():
                            # fetch (create if needed) sequence segment
                            try:
                                protein_segment = ProteinSegment.objects.get(slug=segment)
                            except ProteinSegment.DoesNotExist:
                                self.logger.error('Segment {} not found'.format(segment))
                                continue

                            # fetch (create if needed) coordinates description
                            try:
                                description, created = StructureCoordinatesDescription.objects.get_or_create(
                                    text=coordinates)
                                if created:
                                    self.logger.info('Created structure coordinate description {}'.format(coordinates))
                            except IntegrityError:
                                description = StructureCoordinatesDescription.objects.get(text=coordinates)

                            sc = StructureCoordinates()
                            sc.structure = s
                            sc.protein_segment = protein_segment
                            sc.description = description
                            sc.save()

                    # structure engineering
                    if 'engineering' in sd and sd['engineering']:
                        for segment, engineering in sd['engineering'].items():
                            # fetch (create if needed) sequence segment
                            try:
                                protein_segment = ProteinSegment.objects.get(slug=segment)
                            except ProteinSegment.DoesNotExist:
                                self.logger.error('Segment {} not found'.format(segment))
                                continue

                            # fetch (create if needed) engineering description
                            try:
                                description, created = StructureEngineeringDescription.objects.get_or_create(
                                    text=engineering)
                                if created:
                                    self.logger.info('Created structure coordinate description {}'.format(engineering))
                            except IntegrityError:
                                description = StructureEngineeringDescription.objects.get(text=engineering)

                            se = StructureEngineering()
                            se.structure = s
                            se.protein_segment = protein_segment
                            se.description = description
                            se.save()

                    # protein anomalies
                    scheme = s.protein_conformation.protein.residue_numbering_scheme
                    if 'bulges' in sd and sd['bulges']:
                        pa_slug = 'bulge'
                        try:
                            pab, created = ProteinAnomalyType.objects.get_or_create(slug=pa_slug, defaults={
                                'name': 'Bulge'})
                            if created:
                                self.logger.info('Created protein anomaly type {}'.format(pab))
                        except IntegrityError:
                            pab = ProteinAnomalyType.objects.get(slug=pa_slug)
                        
                        for segment, bulges in sd['bulges'].items():
                            for bulge in bulges:
                                try:
                                    gn, created = ResidueGenericNumber.objects.get_or_create(label=bulge,
                                        scheme=scheme, defaults={'protein_segment': ProteinSegment.objects.get(
                                        slug=segment)})
                                    if created:
                                        self.logger.info('Created generic number {}'.format(gn))
                                except IntegrityError:
                                    gn =  ResidueGenericNumber.objects.get(label=bulge, scheme=scheme)

                                try:
                                    pa, created = ProteinAnomaly.objects.get_or_create(anomaly_type=pab,
                                        generic_number=gn)
                                    if created:
                                        self.logger.info('Created protein anomaly {}'.format(pa))
                                except IntegrityError:
                                    pa = ProteinAnomaly.objects.get(anomaly_type=pab, generic_number=gn)

                                s.protein_anomalies.add(pa)
                    if 'constrictions' in sd and sd['constrictions']:
                        pa_slug = 'constriction'
                        try:
                            pac, created = ProteinAnomalyType.objects.get_or_create(slug=pa_slug, defaults={
                                'name': 'Constriction'})
                            if created:
                                self.logger.info('Created protein anomaly type {}'.format(pac))
                        except IntegrityError:
                            pac = ProteinAnomalyType.objects.get(slug=pa_slug)
                        
                        for segment, constrictions in sd['constrictions'].items():
                            for constriction in constrictions:
                                try:
                                    gn, created = ResidueGenericNumber.objects.get_or_create(label=constriction,
                                        scheme=scheme, defaults={'protein_segment': ProteinSegment.objects.get(
                                        slug=segment)})
                                    if created:
                                        self.logger.info('Created generic number {}'.format(gn))
                                except IntegrityError:
                                    gn =  ResidueGenericNumber.objects.get(label=constriction, scheme=scheme)

                                try:
                                    pa, created = ProteinAnomaly.objects.get_or_create(anomaly_type=pac,
                                        generic_number=gn)
                                    if created:
                                        self.logger.info('Created protein anomaly {}'.format(pa))
                                except IntegrityError:
                                    pa = ProteinAnomaly.objects.get(anomaly_type=pac, generic_number=gn)

                                s.protein_anomalies.add(pa)
                    
                    # stabilizing agents, FIXME - redesign this!
                    # fusion proteins moved to constructs, use this for G-proteins and other agents?
                    aux_proteins = []
                    if 'signaling_protein' in sd and sd['signaling_protein'] and sd['signaling_protein'] != 'None':
                        aux_proteins.append('signaling_protein')
                    if 'auxiliary_protein' in sd and sd['auxiliary_protein'] and sd['auxiliary_protein'] != 'None':
                        aux_proteins.append('auxiliary_protein')
                    for index in aux_proteins:
                        if isinstance(sd[index], list):
                            aps = sd[index]
                        else:
                            aps = [sd[index]]
                        for aux_protein in aps:
                            aux_protein_slug = slugify(aux_protein)[:50]
                            try:
                                sa, created = StructureStabilizingAgent.objects.get_or_create(
                                    slug=aux_protein_slug, defaults={'name': aux_protein})
                            except IntegrityError:
                                sa = StructureStabilizingAgent.objects.get(slug=aux_protein_slug)
                            s.stabilizing_agents.add(sa)

                    # save structure
                    s.save()

                    self.logger.info('Calculate rotamers / residues')
                    self.create_rotamers(s,pdb_path)

                    self.logger.info('Calculate interactions') #Should not error anymore. If it does, fix.
                    runcalculation(sd['pdb'],peptide_chain)
                    parsecalculation(sd['pdb'],False)
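# The same PDB header fields (HETSYN/HETNAM ligand synonyms, REVDAT release date,
# JRNL PMID and DOI) are scraped line by line both above and in new_xtals below.
# A standalone sketch of that parsing; scan_pdb_header is a hypothetical name,
# not part of the original code.
import re

def scan_pdb_header(pdbdata_raw):
    """Collect ligand synonyms and publication fields from raw PDB text."""
    hetsyn, hetsyn_reverse = {}, {}
    publication_date = pubmed = doi = ''
    for line in pdbdata_raw.splitlines():
        if line.startswith('HETSYN') or line.startswith('HETNAM'):
            m = re.match(r"HET(?:SYN|NAM)[\s]+([\w]{3})[\s]+(.+)", line)
            if m:
                hetsyn[m.group(2).strip()] = m.group(1).upper()
                hetsyn_reverse[m.group(1)] = m.group(2).strip().upper()
        elif line.startswith('REVDAT   1'):
            publication_date = line[13:22]        # e.g. '09-JAN-19'
        elif line.startswith('JRNL        PMID'):
            pubmed = line[19:].strip()
        elif line.startswith('JRNL        DOI'):
            doi = line[19:].strip()
    return hetsyn, hetsyn_reverse, publication_date, pubmed, doi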
Example #11
0
    def new_xtals(self, uniprot):
        ''' List GPCR crystal structures missing from GPCRdb and the yaml files. Adds missing structures to DB.
        '''
        structs = self.pdb_request_by_uniprot(uniprot)
        try:
            protein = Protein.objects.get(accession=uniprot)
        except:
            protein = None
        try:
            x50s = Residue.objects.filter(protein_conformation__protein=protein,generic_number__label__in=['1x50','2x50','3x50','4x50','5x50','6x50','7x50'])
        except:
            x50s = None
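        # The seven conserved x.50 positions (1x50-7x50, one per TM helix) are used
        # below to check that a deposited structure still covers all seven helices
        # after the reported deletions.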
        if structs!=['null']:
            for s in structs:
                missing_from_db, missing_yaml = False, False
                try:
                    st_obj = Structure.objects.get(pdb_code__index=s)
                except:
                    if s not in self.exceptions:
                        check = self.pdb_request_by_pdb(s)
                        if check==1:
                            self.db_list.append(s)
                            missing_from_db = True

                if s not in self.yamls and s not in self.exceptions:
                    if s not in self.db_list:
                        check = self.pdb_request_by_pdb(s)
                    else:
                        check = 1
                    if check==1:
                        self.yaml_list.append(s)
                        missing_yaml = True
                if not missing_from_db:
                    continue
                # try:
                pdb_data_dict = fetch_pdb_info(s, protein, new_xtal=True)
                exp_method = pdb_data_dict['experimental_method']
                if exp_method=='Electron Microscopy':
                    st_type = StructureType.objects.get(slug='electron-microscopy')
                elif exp_method=='X-ray diffraction':
                    st_type = StructureType.objects.get(slug='x-ray-diffraction')
                if 'deletions' in pdb_data_dict:
                    for d in pdb_data_dict['deletions']:
                        presentx50s = []
                        for x in x50s:
                            if not d['start']<x.sequence_number<d['end']:
                                presentx50s.append(x)
                        # Filter out ones without all 7 x50 positions present in the xtal
                        if len(presentx50s)!=7:
                            try:
                                del self.db_list[self.db_list.index(s)]
                                missing_from_db = False
                                del self.yaml_list[self.yaml_list.index(s)]
                            except:
                                pass
                else:
                    print('Warning: no deletions in pdb info, check {}'.format(s))
                    continue

                if missing_from_db:
                    pref_chain = ''
                    resolution = pdb_data_dict['resolution']
                    pdb_code, created = WebLink.objects.get_or_create(index=s, web_resource=WebResource.objects.get(slug='pdb'))
                    pdbl = PDB.PDBList()
                    pdbl.retrieve_pdb_file(s, pdir='./', file_format="pdb")
                    with open('./pdb{}.ent'.format(s.lower()), 'r') as f:
                        lines = f.readlines()
                    pdb_file = ''
                    publication_date, pubmed, doi = '','',''
                    state = ProteinState.objects.get(slug='inactive')
                    new_prot, created = Protein.objects.get_or_create(entry_name=s.lower(), accession=None, name=s.lower(), sequence=pdb_data_dict['wt_seq'], family=protein.family,
                                                                      parent=protein, residue_numbering_scheme=protein.residue_numbering_scheme,
                                                                      sequence_type=ProteinSequenceType.objects.get(slug='mod'), source=ProteinSource.objects.get(name='OTHER'),
                                                                      species=protein.species)
                    new_prot_conf, created = ProteinConformation.objects.get_or_create(protein=new_prot, state=state, template_structure=None)
                    for line in lines:
                        if line.startswith('REVDAT   1'):
                            publication_date = line[13:22]
                        if line.startswith('JRNL        PMID'):
                            pubmed = line[19:].strip()
                        if line.startswith('JRNL        DOI'):
                            doi = line[19:].strip()
                        pdb_file+=line
                    pdb_data, created = PdbData.objects.get_or_create(pdb=pdb_file)
                    d = datetime.strptime(publication_date,'%d-%b-%y')
                    publication_date = d.strftime('%Y-%m-%d')
                    try:
                        if doi!='':
                            try:
                                publication = Publication.objects.get(web_link__index=doi)
                            except Publication.DoesNotExist as e:
                                p = Publication()
                                try:
                                    p.web_link = WebLink.objects.get(index=doi, web_resource__slug='doi')
                                except WebLink.DoesNotExist:
                                    wl = WebLink.objects.create(index=doi,
                                        web_resource = WebResource.objects.get(slug='doi'))
                                    p.web_link = wl
                                p.update_from_doi(doi=doi)
                                p.save()
                                publication = p
                        elif pubmed!='':
                            try:
                                publication = Publication.objects.get(web_link__index=pubmed)
                            except Publication.DoesNotExist as e:
                                p = Publication()
                                try:
                                    p.web_link = WebLink.objects.get(index=pubmed,
                                        web_resource__slug='pubmed')
                                except WebLink.DoesNotExist:
                                    wl = WebLink.objects.create(index=pubmed,
                                        web_resource = WebResource.objects.get(slug='pubmed'))
                                    p.web_link = wl
                                p.update_from_pubmed_data(index=pubmed)
                                p.save()
                                publication = p
                    except:
                        pass
                    pcs = PdbChainSelector(s, protein)
                    pcs.run_dssp()
                    preferred_chain = pcs.select_chain()

                    # Run state identification

                    # Create yaml files
                    with open(os.sep.join([settings.DATA_DIR, 'structure_data','constructs', '{}.yaml'.format(pdb_code.index)]), 'w') as construct_file:
                        yaml.dump({'name': pdb_code.index.lower(), 'protein': protein.entry_name}, construct_file, indent=4)
                    with open(os.sep.join([settings.DATA_DIR, 'structure_data','structures','{}.yaml'.format(pdb_code.index)]), 'w') as structure_file:
                        struct_yaml_dict = {'construct': pdb_code.index.lower(), 'pdb': pdb_code.index, 'preferred_chain': preferred_chain, 'auxiliary_protein': '',
                                            'ligand': {'name': 'None', 'pubchemId': 'None', 'title': 'None', 'role': '.nan', 'type': 'None'}, 'signaling_protein': 'None', 'state': 'Inactive'}
                        auxiliary_proteins, ligands = [], []
                        if pdb_data_dict['ligands']!='None':
                            for key, values in pdb_data_dict['ligands'].items():
                                if key in ['SO4','NA','CLR','OLA','OLB','OLC','TAR','NAG','EPE','BU1','ACM','GOL','PEG','PO4','TLA','BOG','CIT','PLM','BMA','MAN','MLI','PGE','SIN','PGO','MES','ZN','NO3','NI','MG','PG4']:
                                    continue
                                else:
                                    ligands.append({'name': key, 'pubchemId': 'None', 'title': pdb_data_dict['ligands'][key]['comp_name'], 'role': '.nan', 'type': 'None'})
                            for key, values in pdb_data_dict['auxiliary'].items():
                                if pdb_data_dict['auxiliary'][key]['subtype'] in ['Expression tag', 'Linker']:
                                    continue
                                else:
                                    auxiliary_proteins.append(pdb_data_dict['auxiliary'][key]['subtype'])
                            for key, values in pdb_data_dict['construct_sequences'].items():
                                if key!=protein.entry_name and key not in struct_yaml_dict['auxiliary_protein']:
                                    if 'arrestin' in key:
                                        struct_yaml_dict['signaling_protein'] = key
                            if len(auxiliary_proteins)>1:
                                struct_yaml_dict['auxiliary_protein'] = ', '.join(auxiliary_proteins)
                            if len(ligands)>1:
                                struct_yaml_dict['ligand'] = ligands
                        yaml.dump(struct_yaml_dict, structure_file, indent=4, default_flow_style=False)

                    # Build residue table for structure
                    build_structure_command = shlex.split('/env/bin/python3 manage.py build_structures -f {}.yaml'.format(pdb_code.index))
                    subprocess.call(build_structure_command)

                    # Check state
                    struct = Structure.objects.get(pdb_code__index=pdb_code.index)
                    pi = PdbStateIdentifier(struct)
                    pi.run()
                    if pi.state is not None:
                        Structure.objects.filter(pdb_code__index=pdb_code.index).update(state=pi.state)
                        print(pi.state, pi.activation_value)
                        with open('../../data/protwis/gpcr/structure_data/structures/{}.yaml'.format(pdb_code.index), 'r') as yf:
                            struct_yaml = yaml.load(yf)
                        struct_yaml['state'] = pi.state.name
                        try:
                            struct_yaml['distance'] = round(float(pi.activation_value), 2)
                        except:
                            struct_yaml['distance'] = None
                        with open('../../data/protwis/gpcr/structure_data/structures/{}.yaml'.format(pdb_code.index), 'w') as struct_yaml_file:
                            yaml.dump(struct_yaml, struct_yaml_file, indent=4, default_flow_style=False)

                    # Check sodium pocket
                    new_prot_conf.sodium_pocket()

                    print('{} added to db (preferred chain: {})'.format(s, preferred_chain))
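
The publication metadata in new_xtals above is read straight out of fixed-column PDB header records: the REVDAT   1 record supplies the date that the loader stores as publication_date, and the JRNL PMID / JRNL DOI records carry the literature reference. A minimal, model-free sketch of just that parsing (the helper name parse_pdb_header_refs is made up for illustration, and the sample header lines below are placeholders, not a real entry):

from datetime import datetime

def parse_pdb_header_refs(lines):
    """Pull revision date, PubMed ID and DOI out of raw PDB header lines."""
    publication_date, pubmed, doi = '', '', ''
    for line in lines:
        if line.startswith('REVDAT   1'):
            publication_date = line[13:22]        # fixed columns, e.g. '04-OCT-17'
        elif line.startswith('JRNL        PMID'):
            pubmed = line[19:].strip()
        elif line.startswith('JRNL        DOI'):
            doi = line[19:].strip()
    if publication_date:
        publication_date = datetime.strptime(publication_date, '%d-%b-%y').strftime('%Y-%m-%d')
    return publication_date, pubmed, doi

# Example with a minimal fake header:
print(parse_pdb_header_refs([
    'REVDAT   1   04-OCT-17 0XXX    0',
    'JRNL        PMID   12345678',
    'JRNL        DOI    10.1000/xyz123',
]))   # ('2017-10-04', '12345678', '10.1000/xyz123')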
Example #12
    def new_xtals(self, uniprot):
        ''' List GPCR crystal structures missing from GPCRdb and the yaml files. Adds missing structures to DB.
        '''
        structs = self.pdb_request_by_uniprot(uniprot)
        try:
            protein = Protein.objects.get(accession=uniprot)
        except:
            protein = None
        try:
            x50s = Residue.objects.filter(
                protein_conformation__protein=protein,
                generic_number__label__in=[
                    '1x50', '2x50', '3x50', '4x50', '5x50', '6x50', '7x50'
                ])
        except:
            x50s = None
        if structs != ['null']:
            for s in structs:
                missing_from_db = False
                try:
                    st_obj = Structure.objects.get(pdb_code__index=s)
                except:
                    if s not in self.exceptions:
                        check = self.pdb_request_by_pdb(s)
                        if check == 1:
                            self.db_list.append(s)
                            missing_from_db = True
                if s not in self.yamls and s not in self.exceptions:
                    if s not in self.db_list:
                        check = self.pdb_request_by_pdb(s)
                    else:
                        check = 1
                    if check == 1:
                        self.yaml_list.append(s)
                if not missing_from_db:
                    continue
                try:
                    pdb_data_dict = fetch_pdb_info(s, protein)
                    exp_method = pdb_data_dict['experimental_method']
                    if exp_method == 'Electron Microscopy':
                        st_type, cr = StructureType.objects.get_or_create(
                            slug='electron-microscopy', name=exp_method)
                    elif exp_method == 'X-ray diffraction':
                        st_type = StructureType.objects.get(
                            slug='x-ray-diffraction')
                    if 'deletions' in pdb_data_dict:
                        for d in pdb_data_dict['deletions']:
                            presentx50s = []
                            for x in x50s:
                                if not d['start'] < x.sequence_number < d['end']:
                                    presentx50s.append(x)
                            # Filter out ones without all 7 x50 positions present in the xtal
                            if len(presentx50s) != 7:
                                try:
                                    del self.db_list[self.db_list.index(s)]
                                    missing_from_db = False
                                    del self.yaml_list[self.yaml_list.index(s)]
                                except:
                                    pass
                    if missing_from_db:
                        pref_chain = ''
                        resolution = pdb_data_dict['resolution']
                        pdb_code, created = WebLink.objects.get_or_create(
                            index=s,
                            web_resource=WebResource.objects.get(slug='pdb'))
                        pdbl = PDB.PDBList()
                        pdbl.retrieve_pdb_file(s, pdir='./', file_format="pdb")
                        with open('./pdb{}.ent'.format(s.lower()), 'r') as f:
                            lines = f.readlines()
                        pdb_file = ''
                        publication_date, pubmed, doi = '', '', ''
                        state = ProteinState.objects.get(slug='inactive')
                        new_prot, created = Protein.objects.get_or_create(
                            entry_name=s.lower(),
                            accession=None,
                            name=s.lower(),
                            sequence=pdb_data_dict['wt_seq'],
                            family=protein.family,
                            parent=protein,
                            residue_numbering_scheme=protein.residue_numbering_scheme,
                            sequence_type=ProteinSequenceType.objects.get(slug='mod'),
                            source=ProteinSource.objects.get(name='OTHER'),
                            species=protein.species)
                        new_prot_conf, created = ProteinConformation.objects.get_or_create(
                            protein=new_prot,
                            state=state,
                            template_structure=None)
                        for line in lines:
                            if line.startswith('REVDAT   1'):
                                publication_date = line[13:22]
                            if line.startswith('JRNL        PMID'):
                                pubmed = line[19:].strip()
                            if line.startswith('JRNL        DOI'):
                                doi = line[19:].strip()
                            pdb_file += line
                        pdb_data, created = PdbData.objects.get_or_create(
                            pdb=pdb_file)
                        d = datetime.strptime(publication_date, '%d-%b-%y')
                        publication_date = d.strftime('%Y-%m-%d')
                        try:
                            if doi != '':
                                try:
                                    publication = Publication.objects.get(
                                        web_link__index=doi)
                                except Publication.DoesNotExist as e:
                                    p = Publication()
                                    try:
                                        p.web_link = WebLink.objects.get(
                                            index=doi,
                                            web_resource__slug='doi')
                                    except WebLink.DoesNotExist:
                                        wl = WebLink.objects.create(
                                            index=doi,
                                            web_resource=WebResource.objects.get(slug='doi'))
                                        p.web_link = wl
                                    p.update_from_doi(doi=doi)
                                    p.save()
                                    publication = p
                            elif pubmed != '':
                                try:
                                    publication = Publication.objects.get(
                                        web_link__index=pubmed)
                                except Publication.DoesNotExist as e:
                                    p = Publication()
                                    try:
                                        p.web_link = WebLink.objects.get(
                                            index=pubmed,
                                            web_resource__slug='pubmed')
                                    except WebLink.DoesNotExist:
                                        wl = WebLink.objects.create(
                                            index=pubmed,
                                            web_resource=WebResource.objects.get(slug='pubmed'))
                                        p.web_link = wl
                                    p.update_from_pubmed_data(index=pubmed)
                                    p.save()
                                    publication = p
                        except:
                            pass
                        pcs = PdbChainSelector(s, protein)
                        pcs.run_dssp()
                        preferred_chain = pcs.select_chain()
                        os.remove('./pdb{}.ent'.format(s.lower()))

                        # Create new structure object
                        Structure.objects.get_or_create(
                            preferred_chain=preferred_chain,
                            resolution=resolution,
                            publication_date=publication_date,
                            representative='f',
                            pdb_code=pdb_code,
                            pdb_data=pdb_data,
                            protein_conformation=new_prot_conf,
                            publication=publication,
                            state=state,
                            structure_type=st_type)
                        print('{} added to db (preferred chain: {})'.format(s, preferred_chain))
                except Exception as msg:
                    print(msg)
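
Both versions of new_xtals keep a candidate structure only when none of its reported deletions removes any of the seven x50 reference residues. That filter is easy to lift out and sanity-check as a pure function; a small sketch, assuming deletions are {'start': ..., 'end': ...} dicts as in the fetch_pdb_info output above and that the x50 positions are given as plain sequence numbers rather than Residue objects (the helper name all_x50s_present is made up):

def all_x50s_present(deletions, x50_positions):
    """True if every x50 position falls outside every deleted range."""
    for d in deletions:
        present = [x for x in x50_positions if not d['start'] < x < d['end']]
        if len(present) != len(x50_positions):
            return False
    return True

# A deletion that leaves all seven x50 positions intact:
print(all_x50s_present([{'start': 230, 'end': 260}], [55, 90, 127, 150, 200, 290, 320]))   # True
# A deletion that swallows the position at 200:
print(all_x50s_present([{'start': 190, 'end': 210}], [55, 90, 127, 150, 200, 290, 320]))   # False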
Example #13
    def create_mutant_data(self, filenames):
        self.logger.info('CREATING MUTANT DATA')

        # what files should be parsed?
        if not filenames:
            filenames = os.listdir(self.structure_data_dir)

        missing_proteins = {}
        mutants_for_proteins = {}

        for source_file in filenames:
            source_file_path = os.sep.join(
                [self.structure_data_dir, source_file])
            if os.path.isfile(source_file_path) and source_file[0] != '.':
                self.logger.info('Reading file {}'.format(source_file_path))
                # read the yaml file

                if source_file[-4:] == 'xlsx' or source_file[-3:] == 'xls':
                    rows = self.loaddatafromexcel(source_file_path)
                    rows = self.analyse_rows(rows)
                elif source_file[-4:] == 'yaml':
                    rows = yaml.load(open(source_file_path, 'r'))
                    temp = []
                    for r in rows:
                        d = {}
                        d['reference'] = r['pubmed']
                        d['protein'] = r['entry_name'].replace("__", "_").lower()
                        d['mutation_pos'] = r['seq']
                        d['mutation_from'] = r['from_res']
                        d['mutation_to'] = r['to_res']
                        d['ligand_name'] = ''
                        d['ligand_type'] = ''
                        d['ligand_id'] = ''
                        d['ligand_class'] = ''
                        d['exp_type'] = ''
                        d['exp_func'] = ''
                        d['exp_wt_value'] = 0
                        d['exp_wt_unit'] = ''
                        d['exp_mu_effect_sign'] = ''
                        d['exp_mu_value_raw'] = 0
                        d['fold_effect'] = 0
                        d['exp_mu_effect_qual'] = ''
                        d['exp_mu_effect_ligand_prop'] = ''
                        d['exp_mu_ligand_ref'] = ''
                        d['opt_type'] = ''
                        d['opt_wt'] = 0
                        d['opt_mu'] = 0
                        d['opt_sign'] = ''
                        d['opt_percentage'] = 0
                        d['opt_qual'] = ''
                        d['opt_agonist'] = ''
                        if len(d['mutation_to']) > 1 or len(d['mutation_from']) > 1:  # if something is off with the amino acid
                            continue
                        temp.append(d)
                    rows = temp
                else:
                    self.logger.info('unknown format {}'.format(source_file))
                    continue

                c = 0
                skipped = 0
                inserted = 0
                for r in rows:
                    c += 1
                    if c % 1000 == 0:
                        self.logger.info('Parsed ' + str(c) +
                                         ' mutant data entries')

                    # publication
                    try:  #fix if it thinks it's float.
                        float(r['reference'])
                        r['reference'] = str(int(r['reference']))
                    except ValueError:
                        pass

                    if r['reference'].isdigit():  #assume pubmed
                        pub_type = 'pubmed'
                    else:  #assume doi
                        pub_type = 'doi'

                    try:
                        pub = Publication.objects.get(
                            web_link__index=r['reference'],
                            web_link__web_resource__slug=pub_type)
                    except Publication.DoesNotExist:
                        pub = Publication()
                        try:
                            pub.web_link = WebLink.objects.get(
                                index=r['reference'],
                                web_resource__slug=pub_type)
                        except WebLink.DoesNotExist:
                            wl = WebLink.objects.create(
                                index=r['reference'],
                                web_resource=WebResource.objects.get(
                                    slug=pub_type))
                            pub.web_link = wl

                        if pub_type == 'doi':
                            pub.update_from_doi(doi=r['reference'])
                        elif pub_type == 'pubmed':
                            pub.update_from_pubmed_data(index=r['reference'])
                        try:
                            pub.save()
                        except:
                            self.logger.error('error with reference ' +
                                              str(r['reference']) + ' ' +
                                              pub_type)
                            continue  #if something off with publication, skip.

                    if r['ligand_type'] == 'PubChem CID' or r['ligand_type'] == 'SMILES':
                        if r['ligand_type'] == 'PubChem CID':
                            pubchem_lookup_value = 'cid'
                        elif r['ligand_type'] == 'SMILES':
                            pubchem_lookup_value = 'smiles'

                        try:
                            web_resource = WebResource.objects.get(
                                slug='pubchem')
                        except:
                            # abort if the PubChem resource is not found
                            raise Exception(
                                'PubChem resource not found, aborting!')

                        if 'ligand_name' in r and r['ligand_name']:
                            ligand_name = str(r['ligand_name'])
                        else:
                            ligand_name = False

                        try:
                            # if this name is canonical and it has a ligand record already
                            l = Ligand.objects.get(
                                name=ligand_name,
                                canonical=True,
                                properities__web_links__web_resource=web_resource,
                                properities__web_links__index=r['ligand_id'])
                        except Ligand.DoesNotExist:
                            try:
                                # if exists under different name
                                l_canonical = Ligand.objects.get(
                                    properities__web_links__web_resource=web_resource,
                                    properities__web_links__index=r['ligand_id'],
                                    canonical=True)
                                l, created = Ligand.objects.get_or_create(
                                    properities=l_canonical.properities,
                                    name=ligand_name,
                                    canonical=False)
                                if created:
                                    self.logger.info(
                                        'Created ligand {}'.format(l.name))
                            except Ligand.DoesNotExist:
                                # fetch ligand from pubchem
                                default_ligand_type = 'Small molecule'
                                lt, created = LigandType.objects.get_or_create(
                                    slug=slugify(default_ligand_type),
                                    defaults={'name': default_ligand_type})
                                l = Ligand()
                                l = l.load_from_pubchem(
                                    pubchem_lookup_value, r['ligand_id'], lt,
                                    ligand_name)

                    elif r['ligand_name']:

                        # if this name is canonical and it has a ligand record already
                        if Ligand.objects.filter(name=r['ligand_name'],
                                                 canonical=True).exists():
                            l = Ligand.objects.get(name=r['ligand_name'],
                                                   canonical=True)

                        # if this matches an alias that only has "one" parent canonical name - eg distinct
                        elif Ligand.objects.filter(
                                name=r['ligand_name'],
                                canonical=False,
                                ambigious_alias=False).exists():
                            l = Ligand.objects.get(name=r['ligand_name'],
                                                   canonical=False,
                                                   ambigious_alias=False)

                        # if this matches an alias that only has several canonical parents, must investigate, start
                        # with empty.
                        elif Ligand.objects.filter(
                                name=r['ligand_name'],
                                canonical=False,
                                ambigious_alias=True).exists():
                            lp = LigandProperities()
                            lp.save()
                            l = Ligand()
                            l.properities = lp
                            l.name = r['ligand_name']
                            l.canonical = False
                            l.ambigious_alias = True
                            l.save()
                            l.load_by_name(r['ligand_name'])

                        # if neither a canonical nor an alias exists, create the records. Remember to check for
                        # canonical / alias status.
                        else:
                            lp = LigandProperities()
                            lp.save()
                            l = Ligand()
                            l.properities = lp
                            l.name = str(r['ligand_name'])
                            l.canonical = True
                            l.ambigious_alias = False
                            l.save()
                            l.load_by_name(str(r['ligand_name']))
                    else:
                        l = None

                    # if this name is canonical and it has a ligand record already
                    if Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=True).exists():
                        l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=True)
                    # if this matches an alias that only has "one" parent canonical name - eg distinct
                    elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=False).exists():
                        l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=False)
                    # if this matches an alias that has several canonical parents, must investigate, start with empty.
                    elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=True).exists():
                        lp = LigandProperities()
                        lp.save()
                        l_ref = Ligand()
                        l_ref.properities = lp
                        l_ref.name = r['exp_mu_ligand_ref']
                        l_ref.canonical = False
                        l_ref.ambigious_alias = True
                        l_ref.save()
                        l_ref.load_by_name(r['exp_mu_ligand_ref'])
                        l_ref.save()
                    # if neither a canonical nor an alias exists, create the records. Remember to check for canonical / alias status.
                    elif r['exp_mu_ligand_ref']:
                        lp = LigandProperities()
                        lp.save()
                        l_ref = Ligand()
                        l_ref.properities = lp
                        l_ref.name = r['exp_mu_ligand_ref']
                        l_ref.canonical = True
                        l_ref.ambigious_alias = False
                        l_ref.save()
                        l_ref.load_by_name(r['exp_mu_ligand_ref'])
                        l_ref.save()
                    else:
                        l_ref = None

                    protein_id = 0
                    residue_id = 0

                    protein = Protein.objects.filter(entry_name=r['protein'])
                    if protein.exists():
                        protein = protein.get()
                        if r['protein'] in mutants_for_proteins:
                            mutants_for_proteins[r['protein']] += 1
                        else:
                            mutants_for_proteins[r['protein']] = 1

                    else:
                        skipped += 1
                        if r['protein'] in missing_proteins:
                            missing_proteins[r['protein']] += 1
                        else:
                            missing_proteins[r['protein']] = 1
                            self.logger.error('Skipped due to no protein ' +
                                              r['protein'])
                        continue

                    res = Residue.objects.filter(
                        protein_conformation__protein=protein,
                        sequence_number=r['mutation_pos'])
                    if res.exists():
                        res = res.get()
                    else:
                        self.logger.error('Skipped due to no residue ' +
                                          r['protein'] + ' pos:' +
                                          str(r['mutation_pos']))
                        skipped += 1
                        continue

                    if r['ligand_class']:
                        l_role, created = LigandRole.objects.get_or_create(
                            name=r['ligand_class'],
                            defaults={'slug': slugify(r['ligand_class'])[:50]
                                      })  # FIXME this should not be needed
                    else:
                        l_role = None

                    if r['exp_type']:
                        exp_type_id, created = MutationExperimentalType.objects.get_or_create(
                            type=r['exp_type'])
                    else:
                        exp_type_id = None

                    if r['exp_func']:
                        exp_func_id, created = MutationFunc.objects.get_or_create(
                            func=r['exp_func'])
                    else:
                        exp_func_id = None

                    if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']:
                        exp_qual_id, created = MutationQual.objects.get_or_create(
                            qual=r['exp_mu_effect_qual'],
                            prop=r['exp_mu_effect_ligand_prop'])
                    else:
                        exp_qual_id = None

                    if r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r['opt_sign'] or r['opt_percentage'] or r['opt_qual'] or r['opt_agonist']:
                        exp_opt_id, created = MutationOptional.objects.get_or_create(
                            type=r['opt_type'],
                            wt=r['opt_wt'],
                            mu=r['opt_mu'],
                            sign=r['opt_sign'],
                            percentage=r['opt_percentage'],
                            qual=r['opt_qual'],
                            agonist=r['opt_agonist'])
                    else:
                        exp_opt_id = None

                    mutation, created = Mutation.objects.get_or_create(
                        amino_acid=r['mutation_to'],
                        protein=protein,
                        residue=res)

                    logtypes = ['pEC50', 'pIC50', 'pK']

                    foldchange = 0
                    typefold = ''
                    if r['exp_wt_value'] != 0 and r['exp_mu_value_raw'] != 0:  # fix for new format

                        if re.match("(" + ")|(".join(logtypes) + ")",
                                    r['exp_type']):  #-log values!
                            foldchange = round(
                                math.pow(10, -r['exp_mu_value_raw']) /
                                pow(10, -r['exp_wt_value']), 3)
                            typefold = r['exp_type'] + "_log"
                        else:
                            foldchange = round(
                                r['exp_mu_value_raw'] / r['exp_wt_value'], 3)
                            typefold = r['exp_type'] + "_not_log"

                        if foldchange < 1 and foldchange != 0:
                            foldchange = -round((1 / foldchange), 3)
                    elif r['fold_effect'] != 0:
                        foldchange = round(r['fold_effect'], 3)
                        if foldchange < 1:
                            foldchange = -round((1 / foldchange), 3)

                    raw_experiment = self.insert_raw(r)
                    obj, created = MutationExperiment.objects.get_or_create(
                        refs=pub,
                        protein=protein,
                        residue=res,
                        ligand=l,
                        ligand_role=l_role,
                        ligand_ref=l_ref,
                        raw=raw_experiment,
                        optional=exp_opt_id,
                        exp_type=exp_type_id,
                        exp_func=exp_func_id,
                        exp_qual=exp_qual_id,
                        mutation=mutation,
                        wt_value=r['exp_wt_value'],  #
                        wt_unit=r['exp_wt_unit'],
                        mu_value=r['exp_mu_value_raw'],
                        mu_sign=r['exp_mu_effect_sign'],
                        foldchange=foldchange)
                    mut_id = obj.id
                    inserted += 1

                self.logger.info('Parsed ' + str(c) +
                                 ' mutant data entries. Skipped ' +
                                 str(skipped))

        sorted_missing_proteins = sorted(missing_proteins.items(),
                                         key=operator.itemgetter(1),
                                         reverse=True)
        sorted_mutants_for_proteins = sorted(mutants_for_proteins.items(),
                                             key=operator.itemgetter(1),
                                             reverse=True)

        self.logger.info('COMPLETED CREATING MUTANTS')
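
The fold-change bookkeeping in create_mutant_data (and in the main_func variants below) is compact but easy to misread: for log-scale assay types (pEC50, pIC50, pK...) the mutant/wild-type ratio is computed on the linear scale via 10^-x, and any ratio below 1 is re-expressed as a negative fold change. A minimal sketch of just that arithmetic, mirroring the convention used above (the function name fold_change is made up):

import math
import re

LOGTYPES = ['pEC50', 'pIC50', 'pK']

def fold_change(wt_value, mu_value, exp_type):
    """Log-scale types are moved back to the linear scale first;
    ratios below 1 become negative reciprocals."""
    if wt_value == 0 or mu_value == 0:
        return 0
    if re.match("(" + ")|(".join(LOGTYPES) + ")", exp_type):   # -log values
        change = round(math.pow(10, -mu_value) / math.pow(10, -wt_value), 3)
    else:
        change = round(mu_value / wt_value, 3)
    if change < 1 and change != 0:
        change = -round(1 / change, 3)
    return change

# A mutant with pEC50 7.0 against a wild type of 9.0 has a 100-fold higher EC50:
print(fold_change(9.0, 7.0, 'pEC50'))   # 100.0
# The reverse direction is reported as a negative fold change:
print(fold_change(7.0, 9.0, 'pEC50'))   # -100.0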
Example #14
    def main_func(self, positions, iteration):
        # filenames
        if not positions[1]:
            rows = self.data[positions[0]:]
        else:
            rows = self.data[positions[0]:positions[1]]


        missing_proteins = {}
        mutants_for_proteins = {}

        c = 0
        skipped = 0
        inserted = 0
        bulk_m = []
        bulk_r = []
        current_sheet = time.time()

        for r in rows:
            # print(source_file,c)
            # PRINT IF ERRORS OCCUR
            # self.logger.info('File '+str(r['source_file'])+' number '+str(c))
            current = time.time()
            c += 1
            # if c%100==0:
            #     self.logger.info('Parsed '+str(c)+' mutant data entries')

            # publication
            try: #fix if it thinks it's float.
                float(r['reference'])
                r['reference'] = str(int(r['reference']))
                float(r['review'])
                r['review'] = str(int(r['review']))
            except ValueError:
                pass

            if r['reference'].isdigit(): #assume pubmed
                pub_type = 'pubmed'
            else: #assume doi
                pub_type = 'doi'

            if r['reference'] not in self.publication_cache:
                try:
                    pub = Publication.objects.get(web_link__index=r['reference'], web_link__web_resource__slug=pub_type)
                except Publication.DoesNotExist:
                    pub = Publication()
                    try:
                        pub.web_link = WebLink.objects.get(index=r['reference'], web_resource__slug=pub_type)
                    except WebLink.DoesNotExist:
                        wl = WebLink.objects.create(index=r['reference'],
                            web_resource = WebResource.objects.get(slug=pub_type))
                        pub.web_link = wl

                    if pub_type == 'doi':
                        pub.update_from_doi(doi=r['reference'])
                    elif pub_type == 'pubmed':
                        pub.update_from_pubmed_data(index=r['reference'])
                    try:
                        pub.save()
                    except:
                        self.logger.error('error with reference ' + str(r['reference']) + ' ' + pub_type)
                        continue #if something off with publication, skip.
                self.publication_cache[r['reference']] = pub
            else:
                pub = self.publication_cache[r['reference']]

            # print(r['review'],r['reference'])
            if r['review'].isdigit(): #assume pubmed
                pub_type = 'pubmed'
            else: #assume doi
                pub_type = 'doi'

            # print(r['review'],pub_type)
            if r['review']:
                if r['review'] not in self.publication_cache:
                    try:
                        pub_review = Publication.objects.get(web_link__index=r['review'], web_link__web_resource__slug=pub_type)
                    except Publication.DoesNotExist:
                        pub_review = Publication()
                        try:
                            pub_review.web_link = WebLink.objects.get(index=r['review'], web_resource__slug=pub_type)
                        except WebLink.DoesNotExist:
                            wl = WebLink.objects.create(index=r['review'],
                                web_resource = WebResource.objects.get(slug=pub_type))
                            pub_review.web_link = wl

                        if pub_type == 'doi':
                            pub_review.update_from_doi(doi=r['review'])
                        elif pub_type == 'pubmed':
                            pub_review.update_from_pubmed_data(index=r['review'])
                        try:
                            pub_review.save()
                        except:
                            self.logger.error('error with review ' + str(r['review']) + ' ' + pub_type)
                            continue #if something off with publication, skip.
                        self.publication_cache[r['review']] = pub_review
                else:
                    pub_review = self.publication_cache[r['review']]
            else:
                pub_review = None

            l = None
            if str(r['ligand_name']) in self.ligand_cache:
                if r['ligand_id'] in self.ligand_cache[str(r['ligand_name'])]:
                    l = self.ligand_cache[str(r['ligand_name'])][r['ligand_id']]
            else:
                self.ligand_cache[str(r['ligand_name'])] = {}

            if not l:
                l = get_or_make_ligand(r['ligand_id'],r['ligand_type'],str(r['ligand_name']))
                self.ligand_cache[str(r['ligand_name'])][r['ligand_id']] = l


            l_ref = None
            if str(r['exp_mu_ligand_ref']) in self.ref_ligand_cache:
                l_ref = self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])]
            else:
                if Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=True).exists(): #if this name is canonical and it has a ligand record already
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=True)
                elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=False).exists(): #if this matches an alias that only has "one" parent canonical name - eg distinct
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=False)
                elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=True).exists(): #if this matches an alias that only has several canonical parents, must investigate, start with empty.
                    lp = LigandProperities()
                    lp.save()
                    l_ref = Ligand()
                    l_ref.properities = lp
                    l_ref.name = r['exp_mu_ligand_ref']
                    l_ref.canonical = False
                    l_ref.ambigious_alias = True
                    l_ref.save()
                    l_ref.load_by_name(r['exp_mu_ligand_ref'])
                    l_ref.save()
                elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False).exists(): # ambigious_alias not specified
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False)
                    l_ref.ambigious_alias = False
                    l_ref.save()
                elif r['exp_mu_ligand_ref']: # if neither a canonical nor an alias exists, create the records. Remember to check for canonical / alias status.
                    lp = LigandProperities()
                    lp.save()
                    l_ref = Ligand()
                    l_ref.properities = lp
                    l_ref.name = r['exp_mu_ligand_ref']
                    l_ref.canonical = True
                    l_ref.ambigious_alias = False
                    l_ref.save()
                    l_ref.load_by_name(r['exp_mu_ligand_ref'])
                    try:
                        l_ref.save()
                    except IntegrityError:
                        l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False)
                        print("error failing ligand, duplicate?")
                        # logger.error("FAILED SAVING LIGAND, duplicate?")
                else:
                    l_ref = None
                self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])] = l_ref


            protein_id = 0
            residue_id = 0

            protein=Protein.objects.filter(entry_name=r['protein'])
            if protein.exists():
                protein=protein.get()
                if r['protein'] in mutants_for_proteins:
                    mutants_for_proteins[r['protein']] += 1
                else:
                    mutants_for_proteins[r['protein']] = 1

            else:
                skipped += 1
                if r['protein'] in missing_proteins:
                    missing_proteins[r['protein']] += 1
                else:
                    missing_proteins[r['protein']] = 1
                    self.logger.error('Skipped due to no protein '+ r['protein'])
                continue

            res=Residue.objects.filter(protein_conformation__protein=protein,amino_acid=r['mutation_from'],sequence_number=r['mutation_pos']) #FIXME MAKE AA CHECK
            if res.exists():
                res=res.get()
            else:
                self.logger.error('Skipped due to no residue or mismatch AA ' + r['protein'] + ' pos:'+str(r['mutation_pos']) + ' AA:'+r['mutation_from'])
                skipped += 1
                continue

            if r['ligand_class']:
                l_role, created = LigandRole.objects.get_or_create(name=r['ligand_class'],
                    defaults={'slug': slugify(r['ligand_class'])[:50]}) # FIXME this should not be needed
            else:
                l_role = None

            if r['exp_type']:
                exp_type_id, created = MutationExperimentalType.objects.get_or_create(type=r['exp_type'])
            else:
                exp_type_id = None

            if r['exp_func']:
                exp_func_id, created = MutationFunc.objects.get_or_create(func=r['exp_func'])
            else:
                exp_func_id = None

            if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']:
                exp_qual_id, created = MutationQual.objects.get_or_create(qual=r['exp_mu_effect_qual'], prop=r['exp_mu_effect_ligand_prop'])
            else:
                exp_qual_id = None

            if r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r['opt_sign'] or r['opt_percentage'] or r['opt_qual'] or r['opt_agonist']:
                exp_opt_id, created =  MutationOptional.objects.get_or_create(type=r['opt_type'], wt=r['opt_wt'], mu=r['opt_mu'], sign=r['opt_sign'], percentage=r['opt_percentage'], qual=r['opt_qual'], agonist=r['opt_agonist'])
            else:
                exp_opt_id = None

            try:
                mutation, created =  Mutation.objects.get_or_create(amino_acid=r['mutation_to'],protein=protein, residue=res)
            except IntegrityError:
                mutation = Mutation.objects.get(amino_acid=r['mutation_to'],protein=protein, residue=res)
            logtypes = ['pEC50','pIC50','pK']

            foldchange = 0
            typefold = ''
            if r['exp_wt_value']!=0 and r['exp_mu_value_raw']!=0: #fix for new format

                if re.match("(" + ")|(".join(logtypes) + ")", r['exp_type']):  #-log values!
                    foldchange = round(math.pow(10,-r['exp_mu_value_raw'])/pow(10,-r['exp_wt_value']),3);
                    typefold = r['exp_type']+"_log"
                else:
                    foldchange = round(r['exp_mu_value_raw']/r['exp_wt_value'],3);
                    typefold = r['exp_type']+"_not_log"


                if foldchange<1 and foldchange!=0:
                    foldchange = -round((1/foldchange),3)
            elif r['fold_effect']!=0:
                    foldchange = round(r['fold_effect'],3);
                    if foldchange<1: foldchange = -round((1/foldchange),3);


            raw_experiment = self.insert_raw(r)
            bulk = MutationExperiment(
            refs=pub,
            review=pub_review,
            protein=protein,
            residue=res,
            ligand=l,
            ligand_role=l_role,
            ligand_ref = l_ref,
            #raw = raw_experiment, #raw_experiment, OR None
            optional = exp_opt_id,
            exp_type=exp_type_id,
            exp_func=exp_func_id,
            exp_qual = exp_qual_id,

            mutation=mutation,
            wt_value=r['exp_wt_value'], #
            wt_unit=r['exp_wt_unit'],

            mu_value = r['exp_mu_value_raw'],
            mu_sign = r['exp_mu_effect_sign'],
            foldchange = foldchange
            )
            # mut_id = obj.id
            bulk_r.append(raw_experiment)
            bulk_m.append(bulk)
            inserted += 1
            end = time.time()
            diff = round(end - current,2)
            #print(diff)

        self.logger.info('Parsed '+str(c)+' mutant data entries. Skipped '+str(skipped))

        current = time.time()

        raws = MutationRaw.objects.bulk_create(bulk_r)
        for i,me in enumerate(bulk_m):
            me.raw = raws[i]
        MutationExperiment.objects.bulk_create(bulk_m)
        end = time.time()
        diff = round(end - current,2)
        diff_2 = round(end - current_sheet,2)
        print("overall",diff_2,"bulk",diff,len(bulk_m),"skipped",str(skipped))
Example #15
    def main_func(self, positions, iteration,count,lock):
        # filenames
        # if not positions[1]:
        #     rows = self.data[positions[0]:]
        # else:
        #     rows = self.data[positions[0]:positions[1]]


        missing_proteins = {}
        mutants_for_proteins = {}
        wrong_uniport_ids = {}

        c = 0
        skipped = 0
        inserted = 0
        bulk_m = []
        bulk_r = []
        current_sheet = time.time()

        rows = self.data_all
        while count.value<len(rows):
            with lock:
                r = rows[count.value]
                count.value +=1 
        # for r in rows:
            # print(r['source_file'],c)
            # PRINT IF ERRORS OCCUR
            #self.logger.info('File '+str(r['source_file'])+' number '+str(c))
            current = time.time()
            c += 1
            # if c%100==0:
            #     self.logger.info('Parsed '+str(c)+' mutant data entries')

            # publication
            try: #fix if it thinks it's float.
                float(r['reference'])
                r['reference'] = str(int(r['reference']))
                float(r['review'])
                r['review'] = str(int(r['review']))
            except ValueError:
                pass

            if r['reference'].isdigit(): #assume pubmed
                pub_type = 'pubmed'
            else: #assume doi
                pub_type = 'doi'

            if r['reference'] not in self.publication_cache:
                try:
                    wl = WebLink.objects.get(index=r['reference'], web_resource__slug=pub_type)
                except WebLink.DoesNotExist:
                    try:
                        wl = WebLink.objects.create(index=r['reference'],
                                web_resource = WebResource.objects.get(slug=pub_type))
                    except IntegrityError:
                        wl = WebLink.objects.get(index=r['reference'], web_resource__slug=pub_type)


                try:
                    pub = Publication.objects.get(web_link=wl)
                except Publication.DoesNotExist:
                    pub = Publication()
                    try:
                        pub.web_link = wl
                        pub.save()
                    except IntegrityError:
                        pub = Publication.objects.get(web_link=wl)


                    if pub_type == 'doi':
                        pub.update_from_doi(doi=r['reference'])
                    elif pub_type == 'pubmed':
                        pub.update_from_pubmed_data(index=r['reference'])
                    try:
                        pub.save()
                    except:
                        self.logger.error('error with reference ' + str(r['reference']) + ' ' + pub_type)
                        continue #if something off with publication, skip.
                self.publication_cache[r['reference']] = pub
            else:
                pub = self.publication_cache[r['reference']]

            # print(r['review'],r['reference'])
            if r['review'].isdigit(): #assume pubmed
                pub_type = 'pubmed'
            elif r['review'].startswith('http'):
                pub_type = 'raw_link'
            else: #assume doi
                pub_type = 'doi'

            # print(r['review'],pub_type)
            if r['review']:
                if r['review'] not in self.publication_cache:
                    try:
                        wl = WebLink.objects.get(index=r['review'], web_resource__slug=pub_type)
                    except WebLink.DoesNotExist:
                        try:
                            wl = WebLink.objects.create(index=r['review'],
                                    web_resource = WebResource.objects.get(slug=pub_type))
                        except IntegrityError:
                            wl = WebLink.objects.get(index=r['review'], web_resource__slug=pub_type)

                    try:
                        pub_review = Publication.objects.get(web_link=wl)
                    except Publication.DoesNotExist:
                        pub_review = Publication()
                        try:
                            pub_review.web_link = wl
                            pub_review.save()
                        except IntegrityError:
                            pub_review = Publication.objects.get(web_link=wl)


                        if pub_type == 'doi':
                            pub_review.update_from_doi(doi=r['review'])
                        elif pub_type == 'pubmed':
                            pub_review.update_from_pubmed_data(index=r['review'])
                        try:
                            pub_review.save()
                        except:
                            self.logger.error('error with review ' + str(r['review']) + ' ' + pub_type)
                            continue #if something off with publication, skip.
                        self.publication_cache[r['review']] = pub_review
                else:
                    pub_review = self.publication_cache[r['review']]
            else:
                pub_review = None

            l = None
            if str(r['ligand_name']) in self.ligand_cache:
                if r['ligand_id'] in self.ligand_cache[str(r['ligand_name'])]:
                    l = self.ligand_cache[str(r['ligand_name'])][r['ligand_id']]
            else:
                self.ligand_cache[str(r['ligand_name'])] = {}

            if not l:
                try:
                    l = get_or_make_ligand(r['ligand_id'],r['ligand_type'],str(r['ligand_name']))
                except Exception as msg:
                    print('Something errored with ligand, aborting entry of mutation',r['ligand_name'],r['ligand_type'],r['ligand_id'],r['source_file'])
                    print(msg)
                    traceback.print_exc()
                    continue
                self.ligand_cache[str(r['ligand_name'])][r['ligand_id']] = l


            l_ref = None
            if str(r['exp_mu_ligand_ref']) in self.ref_ligand_cache:
                l_ref = self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])]
            else:
                if Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=True).exists(): #if this name is canonical and it has a ligand record already
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=True)
                elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=False).exists(): #if this matches an alias with exactly one canonical parent, i.e. it is unambiguous
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=False)
                elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False, ambigious_alias=True).exists(): #if this matches an alias with several canonical parents, it needs investigation, so start with an empty record.
                    lp = LigandProperities()
                    lp.save()
                    l_ref = Ligand()
                    l_ref.properities = lp
                    l_ref.name = r['exp_mu_ligand_ref']
                    l_ref.canonical = False
                    l_ref.ambigious_alias = True
                    l_ref.save()
                    l_ref.load_by_name(r['exp_mu_ligand_ref'])
                    l_ref.save()
                elif Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=False).exists(): #ambigious_alias not specified
                    l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False)
                    l_ref.ambigious_alias = False
                    l_ref.save()
                elif r['exp_mu_ligand_ref']: #if neither a canonical nor an alias record exists, create one. Remember to check the canonical / alias status.
                    lp = LigandProperities()
                    lp.save()
                    l_ref = Ligand()
                    l_ref.properities = lp
                    l_ref.name = r['exp_mu_ligand_ref']
                    l_ref.canonical = True
                    l_ref.ambigious_alias = False
                    try:
                        l_ref.save()
                        l_ref.load_by_name(r['exp_mu_ligand_ref'])
                    except IntegrityError:
                        if Ligand.objects.filter(name=r['exp_mu_ligand_ref'], canonical=True).exists():
                            l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=True)
                        else:
                            l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False)
                        # print("error failing ligand, duplicate?")
                    try:
                        l_ref.save()
                    except IntegrityError:
                        l_ref = Ligand.objects.get(name=r['exp_mu_ligand_ref'], canonical=False)
                        # print("error failing ligand, duplicate?")
                        # logger.error("FAILED SAVING LIGAND, duplicate?")
                else:
                    l_ref = None
                self.ref_ligand_cache[str(r['exp_mu_ligand_ref'])] = l_ref


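            # Resolve the protein for this row. Try the entry name directly, then the
            # wrong_uniport_ids lookup table, then a UniProt accession web link, and
            # finally the UniProt web service; rows whose protein cannot be resolved
            # are counted in missing_proteins and skipped.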
            protein_id = 0
            residue_id = 0

            protein=Protein.objects.filter(entry_name=r['protein'])
            if protein.exists():
                protein=protein.get()
                if r['protein'] in mutants_for_proteins:
                    mutants_for_proteins[r['protein']] += 1
                else:
                    mutants_for_proteins[r['protein']] = 1

            elif r['protein'] not in missing_proteins:

                try:
                    # map the reported id through the lookup table first, then fetch by the corrected entry name
                    real_uniprot = wrong_uniport_ids[r['protein']]
                    r['protein'] = real_uniprot
                    protein=Protein.objects.get(entry_name=r['protein'])
                    # print('fetched with lookup table',r['protein'])
                except Exception:
                    # look for it as uniprot
                    protein=Protein.objects.filter(web_links__web_resource__slug='uniprot', web_links__index=r['protein'].upper())
                    if protein.exists():
                        protein=protein.get()
                        real_uniprot = protein.entry_name
                        if r['protein'] in mutants_for_proteins:
                            mutants_for_proteins[r['protein']] += 1
                        else:
                            mutants_for_proteins[r['protein']] = 1
                    else:
                        # Try to lookup in uniprot to catch typing errors / variants in entry_name
                        url = 'http://www.uniprot.org/uniprot/$index.xml'
                        cache_dir = ['uniprot', 'id']
                        uniprot_protein = fetch_from_web_api(url, r['protein'], cache_dir, xml = True)
                        try:
                            real_uniprot = uniprot_protein.find('.//{http://uniprot.org/uniprot}name').text.lower()
                            protein=Protein.objects.get(entry_name=real_uniprot)
                        except Exception:
                            skipped += 1
                            if r['protein'] in missing_proteins:
                                missing_proteins[r['protein']] += 1
                            else:
                                missing_proteins[r['protein']] = 1
                                # print('Skipped due to no protein '+ r['protein'])
                                self.logger.error('Skipped due to no protein '+ r['protein'])
                            continue
                    wrong_uniport_ids[r['protein']] = protein.entry_name
                    r['protein'] = real_uniprot
            else:
                missing_proteins[r['protein']] += 1
                continue


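            # Match the wild-type residue: the reported position and amino acid must exist
            # in the Residue table for this protein, otherwise the row is skipped.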
            res=Residue.objects.filter(protein_conformation__protein=protein,amino_acid=r['mutation_from'],sequence_number=r['mutation_pos']) #FIXME MAKE AA CHECK
            if res.exists():
                res=res.get()
            else:
                self.logger.error('Skipped due to no residue or mismatch AA ' + r['protein'] + ' pos:'+str(r['mutation_pos']) + ' AA:'+r['mutation_from'])
                # print('Skipped due to no residue or mismatch AA ' + r['protein'] + ' pos:'+str(r['mutation_pos']) + ' AA:'+r['mutation_from'],r['source_file'])
                skipped += 1
                continue

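            # Map the free-text ligand class onto a LigandRole, falling back to a
            # slug-based lookup when get_or_create collides with an existing slug.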
            if r['ligand_class']:
                try:
                    l_role, created = LigandRole.objects.get_or_create(name=r['ligand_class'],
                        defaults={'slug': slugify(r['ligand_class'])[:50]}) # FIXME this should not be needed
                except Exception as e:
                    if LigandRole.objects.filter(slug=slugify(r['ligand_class'])[:50]).exists():
                        l_role = LigandRole.objects.get(slug=slugify(r['ligand_class'])[:50])
                        if l_role.name == slugify(r['ligand_class'])[:50]:
                            #if the role name equals the slug, it was auto-created by the constructs script, so replace it with the proper name
                            l_role.name = r['ligand_class']
                            l_role.save()
                    else:
                        print(e)
                        print("Error with",r['ligand_class'],slugify(r['ligand_class'])[:50] )
                        l_role, created = LigandRole.objects.get_or_create(slug=slugify(r['ligand_class'])[:50]) # FIXME this should not be needed
            else:
                l_role = None

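            # Normalise the remaining experiment descriptors into small lookup tables:
            # experimental type, functional readout and qualitative effect.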
            if r['exp_type']:
                exp_type_id, created = MutationExperimentalType.objects.get_or_create(type=r['exp_type'])
            else:
                exp_type_id = None

            if r['exp_func']:
                exp_func_id, created = MutationFunc.objects.get_or_create(func=r['exp_func'])
            else:
                exp_func_id = None

            if r['exp_mu_effect_ligand_prop'] or r['exp_mu_effect_qual']:
                exp_qual_id, created = MutationQual.objects.get_or_create(qual=r['exp_mu_effect_qual'], prop=r['exp_mu_effect_ligand_prop'])
            else:
                exp_qual_id = None

            # if r['opt_type'] or r['opt_wt'] or r['opt_mu'] or r['opt_sign'] or r['opt_percentage'] or r['opt_qual'] or r['opt_agonist']:
            #     exp_opt_id, created =  MutationOptional.objects.get_or_create(type=r['opt_type'], wt=r['opt_wt'], mu=r['opt_mu'], sign=r['opt_sign'], percentage=r['opt_percentage'], qual=r['opt_qual'], agonist=r['opt_agonist'])
            # else:
            #     exp_opt_id = None

            try:
                mutation, created =  Mutation.objects.get_or_create(amino_acid=r['mutation_to'],protein=protein, residue=res)
            except IntegrityError:
                mutation = Mutation.objects.get(amino_acid=r['mutation_to'],protein=protein, residue=res)
            logtypes = ['pEC50','pIC50','pK']

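            # Compute the mutant/wild-type fold change. For -log-scaled measures
            # (pEC50, pIC50, pK) the values are converted back to linear scale first,
            # e.g. wt pEC50 7.0 vs mutant 6.0 gives 10**-6.0 / 10**-7.0 = 10, a 10-fold
            # loss of potency. Fold changes between 0 and 1 are stored as the negative
            # reciprocal, so 0.1 becomes -10.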
            foldchange = 0
            typefold = ''
            if r['exp_wt_value']!=0 and r['exp_mu_value_raw']!=0: #fix for new format
                if re.match("(" + ")|(".join(logtypes) + ")", r['exp_type']):  #-log values!
                    try:
                        foldchange = round(math.pow(10,-r['exp_mu_value_raw'])/math.pow(10,-r['exp_wt_value']),3)
                    except Exception:
                        print(r)
                    typefold = r['exp_type']+"_log"
                elif r['exp_wt_unit'] == "%":
                    # if the unit is %, it is a difference measure where a lower value is worse; otherwise it is a concentration where lower is better
                    foldchange = round(r['exp_wt_value']/r['exp_mu_value_raw'],3)
                else:
                    foldchange = round(r['exp_mu_value_raw']/r['exp_wt_value'],3)
                    typefold = r['exp_type']+"_not_log"
                if 0 < foldchange < 1:
                    foldchange = -round((1/foldchange),3)
            elif r['fold_effect']!=0:
                foldchange = round(r['fold_effect'],3)
                if foldchange < 1:
                    foldchange = -round((1/foldchange),3)
            r['fold_effect'] = foldchange
            
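            # Build the raw data record and the MutationExperiment without saving them here;
            # both are queued in bulk_r / bulk_m and inserted with bulk_create after the loop,
            # which avoids one database round trip per row.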
            raw_experiment = self.insert_raw(r)
            # raw_experiment.save()
            bulk = MutationExperiment(
                refs=pub,
                review=pub_review,
                submitting_group=r['submitting_group'],
                data_container=r['data_container'],
                data_container_number=r['data_container_number'],
                protein=protein,
                residue=res,
                ligand=l,
                ligand_role=l_role,
                ligand_ref=l_ref,
                # raw=raw_experiment, #raw_experiment, OR None
                # optional=exp_opt_id,
                exp_type=exp_type_id,
                exp_func=exp_func_id,
                exp_qual=exp_qual_id,
                mutation=mutation,
                wt_value=r['exp_wt_value'],
                wt_unit=r['exp_wt_unit'],
                mu_value=r['exp_mu_value_raw'],
                mu_sign=r['exp_mu_effect_sign'],
                foldchange=foldchange,
                opt_receptor_expression=r['opt_receptor_expression'],
                opt_basal_activity=r['opt_basal_activity'],
                opt_gain_of_activity=r['opt_gain_of_activity'],
                opt_ligand_emax=r['opt_ligand_emax'],
                opt_agonist=r['opt_agonist'],
            )
            # for line,val in r.items():
            #     val = str(val)
            #     if len(val)>100:
            #         print(line,"too long",val)
            # mut_id = obj.id
            bulk_r.append(raw_experiment)
            bulk_m.append(bulk)
            # try:
            #     bulk.save()
            # except Exception as e:
            #     print(e)
            #     print(r)
            #     break
            #print('saved ',r['source_file'])
            inserted += 1
            end = time.time()
            diff = round(end - current,2)
            #print(diff)

        self.logger.info('Parsed '+str(c)+' mutant data entries. Skipped '+str(skipped))

        current = time.time()

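        # Bulk-insert the raw rows, attach each created MutationRaw to its MutationExperiment
        # by position, then bulk-insert the experiments. bulk_create returns the objects in
        # input order; note this assumes the database backend populates their primary keys
        # on bulk_create (PostgreSQL does).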
        raws = MutationRaw.objects.bulk_create(bulk_r)
        for i,me in enumerate(bulk_m):
            me.raw = raws[i]
        MutationExperiment.objects.bulk_create(bulk_m)
        end = time.time()
        diff = round(end - current,2)
        diff_2 = round(end - current_sheet,2)
        print("overall",diff_2,"bulk",diff,len(bulk_m),"skipped",str(skipped))
        sorted_missing_proteins = sorted(missing_proteins.items(), key=operator.itemgetter(1),reverse=True)