def __init__(self, pdb_file=None, pdb_filename=None, structure=None, blast_path='blastp',
             blastdb=os.sep.join([settings.STATICFILES_DIRS[0], 'blast', 'protwis_blastdb'])):
    """Set up the BLAST search, load the reference structure and parse it.

    ``pdb_file`` (a name/path or a handle to an open file) takes precedence
    over ``pdb_filename``; when neither is truthy the already-built
    ``structure`` object is used unchanged.
    """
    self.pdb_file = pdb_file
    self.pdb_filename = pdb_filename

    # 'MappedResidue' objects storing information about alignments and bw numbers
    self.residues = {}
    self.pdb_seq = {}
    # uniprot ids returned from blast
    self.prot_id_list = []

    # local blast search
    self.blast = BlastSearch(blast_path=blast_path, blastdb=blastdb)

    pdb_source = self.pdb_file or self.pdb_filename
    if pdb_source:
        # keep only the first element of the parsed structure
        parser = PDBParser(PERMISSIVE=True, QUIET=True)
        self.pdb_structure = parser.get_structure('ref', pdb_source)[0]
    else:
        self.pdb_structure = structure

    self.parse_structure(self.pdb_structure)
def __init__(
    self,
    pdb_file=None,
    pdb_filename=None,
    structure=None,
    blast_path="blastp",
    blastdb=os.sep.join([settings.STATICFILES_DIRS[0], "blast", "protwis_blastdb"]),
):
    """Prepare the local BLAST search, pick the structure source and parse it.

    The first truthy value of ``pdb_file`` (name/path or open file handle)
    and ``pdb_filename`` is read with ``PDBParser``; otherwise ``structure``
    is taken as given.
    """
    self.pdb_file = pdb_file
    self.pdb_filename = pdb_filename
    # 'MappedResidue' objects with alignment and bw-number information
    self.residues = {}
    self.pdb_seq = {}
    # uniprot ids returned from blast
    self.prot_id_list = []
    self.blast = BlastSearch(blast_path=blast_path, blastdb=blastdb)

    for candidate in (self.pdb_file, self.pdb_filename):
        if candidate:
            parsed = PDBParser(PERMISSIVE=True, QUIET=True).get_structure("ref", candidate)
            self.pdb_structure = parsed[0]
            break
    else:
        # neither a file nor a file name was supplied
        self.pdb_structure = structure

    self.parse_structure(self.pdb_structure)
def __init__(self, pdb_file=None, sequence=None, wt_protein_id=None):
    """Parse a pdb file and map its residues onto the wild-type protein.

    Args:
        pdb_file: name/path or open handle of the pdb file; may be None.
        sequence: optional sequence. A plain string is converted to a
            ``{index: residue}`` dictionary, anything else is stored as given.
        wt_protein_id: id of the wild-type ``Protein``. When omitted, the
            wild type is looked up through the structure id read from the
            pdb SEQRES section.
    """
    # dictionary of 'ParsedResidue' object storing information about alignments and bw numbers
    self.mapping = {}
    self.residues = {}
    self.segments = {}
    self.blast = BlastSearch(blastdb=os.sep.join([settings.STATICFILES_DIRS[0], 'blast', 'protwis_blastdb']))
    self.wt_protein_id = wt_protein_id

    # Always define these so that later code never hits a missing attribute
    # when no pdb file was supplied.
    self.pdb_struct = None
    self.seqres = None
    self.struct_id = None
    if pdb_file is not None:
        self.pdb_struct = PDBParser(QUIET=True).get_structure('pdb', pdb_file)[0]
        # a list of SeqRecord objects retrieved from the pdb SEQRES section
        try:
            self.seqres = list(SeqIO.parse(pdb_file, 'pdb-seqres'))
            # SeqRecord id is a pdb_code:chain
            self.struct_id = self.seqres[0].id.split(':')[0]
        except Exception:
            # Best effort: files without a usable SEQRES section are accepted.
            self.seqres = None
            self.struct_id = None

    # The original compared type(sequence) with the literal "string", which
    # is never true, and then referenced a misspelled name.
    if isinstance(sequence, str):
        self.sequence = dict(enumerate(sequence))
    else:
        self.sequence = sequence

    # If not specified, attempt to get wildtype from pdb.
    self.wt = None
    if wt_protein_id:
        self.wt = Protein.objects.get(id=wt_protein_id)
    elif pdb_file is not None:
        try:
            self.wt = Structure.objects.get(
                pdb_code__index=self.struct_id).protein_conformation.protein.parent
        except Exception:
            # Unknown structure: continue without a wild type, as before.
            self.wt = None
    # wt_seq used to stay unset when the wild type came from the structure.
    self.wt_seq = str(self.wt.sequence) if self.wt is not None else ''

    self.fusions = []

    if self.pdb_struct is not None:
        self.parse_pdb(self.pdb_struct)
    self.mark_deletions()
def __init__ (self, pdb_file=None, pdb_filename=None, structure=None, pdb_code=None, blast_path='blastp',
              blastdb=os.sep.join([settings.STATICFILES_DIRS[0], 'blast', 'protwis_blastdb']),top_results=1,
              sequence_parser=False, signprot=False):
    """Load a structure either directly or through ``SequenceParser``.

    With ``sequence_parser`` set, the structure, the residue mapping and the
    wild type are taken from a ``SequenceParser`` built on ``pdb_file``; the
    wild-type id passed to it comes from ``signprot`` when given, else from
    the structure registered under ``pdb_code``, else it is left out.
    Otherwise the structure is read from ``pdb_file`` / ``pdb_filename`` (or
    ``structure`` is used as given) and handed to ``parse_structure``.
    """
    # pdb_file can be either a name/path or a handle to an open file
    self.pdb_file = pdb_file
    self.pdb_filename = pdb_filename
    # pdb 4 letter code, if specified
    self.pdb_code = pdb_code

    # 'MappedResidue' objects storing information about alignments and bw numbers
    self.residues = {}
    self.pdb_seq = {}
    # uniprot ids returned from blast
    self.prot_id_list = []

    # local blast search
    self.blast = BlastSearch(blast_path=blast_path, blastdb=blastdb,top_results=top_results)

    if not sequence_parser:
        pdb_source = self.pdb_file or self.pdb_filename
        if pdb_source:
            self.pdb_structure = PDBParser(PERMISSIVE=True, QUIET=True).get_structure('ref', pdb_source)[0]
        else:
            self.pdb_structure = structure
        # NOTE(review): the collapsed source does not show whether this call
        # also runs on the sequence-parser path; it is kept on this path only
        # -- confirm against the original file.
        self.parse_structure(self.pdb_structure)
        return

    # sequence parser path
    if pdb_code:
        struct = Structure.objects.get(pdb_code__index=self.pdb_code)

    if signprot:
        parser_kwargs = {'wt_protein_id': signprot.id}
    elif pdb_code:
        parser_kwargs = {'wt_protein_id': struct.protein_conformation.protein.parent.id}
    else:
        parser_kwargs = {}
    s = SequenceParser(pdb_file=self.pdb_file, **parser_kwargs)

    self.pdb_structure = s.pdb_struct
    self.mapping = s.mapping
    self.wt = s.wt
def __init__(self, pdb_file, sequence=None, wt_protein_id=None):
    """Read ``pdb_file``, resolve the wild-type protein and parse the structure.

    The wild type is ``Protein`` ``wt_protein_id`` when that is given;
    otherwise it is the parent protein of the structure whose code is read
    from the first SEQRES record. ``sequence`` is accepted but not used here.
    """
    # 'ParsedResidue' objects storing information about alignments and bw numbers
    self.mapping = {}
    self.residues = {}

    human_blastdb = os.sep.join([settings.STATICFILES_DIRS[0], 'blast', 'protwis_human_blastdb'])
    self.blast = BlastSearch(blastdb=human_blastdb)

    parsed = PDBParser(QUIET=True).get_structure('pdb', pdb_file)
    self.pdb_struct = parsed[0]

    # SeqRecord objects retrieved from the pdb SEQRES section
    self.seqres = list(SeqIO.parse(pdb_file, 'pdb-seqres'))
    # a SeqRecord id is pdb_code:chain
    first_record_id = self.seqres[0].id
    self.struct_id = first_record_id.split(':')[0]

    if wt_protein_id:
        self.wt = Protein.objects.get(id=wt_protein_id)
    else:
        # not specified: take the wild type registered for this structure
        known_structure = Structure.objects.get(pdb_code__index=self.struct_id)
        self.wt = known_structure.protein_conformation.protein.parent
    self.wt_seq = str(self.wt.sequence)

    self.fusions = []
    self.parse_pdb(self.pdb_struct)
def post(self, request, *args, **kwargs):
    """Run a BLAST search on the posted sequence and render the hits.

    When the form contains a ``human`` field the human-only BLAST database
    is searched; otherwise ``BlastSearch`` is built without a ``blastdb``
    argument. Up to 50 results are requested in both cases.
    """
    search_options = {'top_results': 50}
    if 'human' in request.POST:
        search_options['blastdb'] = os.sep.join(
            [settings.STATICFILES_DIRS[0], 'blast', 'protwis_human_blastdb'])

    blast = BlastSearch(**search_options)
    blast_out = blast.run(request.POST['input_seq'])

    # pair every hit's protein with the second element of the hit
    results = []
    for hit in blast_out:
        results.append((Protein.objects.get(pk=hit[0]), hit[1]))

    context = {
        'results': results,
        "input": request.POST['input_seq'],
    }
    return render(request, self.template_name, context)
def handle(self, *args, **options):
    """Optionally build a BLAST database, then run the requested queries.

    Options read:
        d: BLAST database to query; defaults to ``blastp_out.fasta``.
        make_db: list selecting the sequences written to
            ``./blastp_out.fasta`` before ``makeblastdb`` is run on it:
            ``['xtal']`` (parent proteins of all structures), ``['all']``
            (proteins with an accession in the ``Class*`` families), any
            other single item (used as one raw sequence), or several items
            (treated as protein entry names).
        q: list of query sequences; for every hit its two elements are
            printed.
    """
    # FIXME import/parse blast db
    blastdb = options['d'] if options['d'] else 'blastp_out.fasta'

    make_db = options['make_db']
    if make_db:
        fasta_record = '>{}\n{}\n'.format
        if make_db == ['xtal']:
            # xtal preset: one record per distinct parent protein
            seen = set()
            records = []
            for struct in Structure.objects.all():
                parent = struct.protein_conformation.protein.parent
                if parent.pk in seen:
                    continue
                seen.add(parent.pk)
                records.append(fasta_record(parent.entry_name, parent.sequence))
        elif make_db == ['all']:
            receptor_fams = ProteinFamily.objects.filter(name__startswith='Class')
            prots = Protein.objects.filter(accession__isnull=False,
                                           family__parent__parent__parent__in=receptor_fams)
            records = [fasta_record(prot.entry_name, prot.sequence) for prot in prots]
        elif len(make_db) == 1:
            records = [fasta_record('single input', make_db[0])]
        else:
            # Several entry names: previously the proteins were queried but no
            # FASTA text was ever built for them.
            prots = Protein.objects.filter(entry_name__in=make_db)
            records = [fasta_record(prot.entry_name, prot.sequence) for prot in prots]

        with open('./blastp_out.fasta', 'w') as f:
            f.write(''.join(records))
        make_db_command = shlex.split('makeblastdb -in blastp_out.fasta -dbtype prot -parse_seqids')
        subprocess.call(make_db_command)

    if options['q']:
        for q in options['q']:
            # blastdb is always set above, so the former fallback to a
            # default BlastSearch() could never run and has been dropped.
            bs = BlastSearch(blastdb=blastdb, top_results=1)
            for o in bs.run(q):
                print(o[0])
                print(o[1])
def _collect_construct_entry_names(self):
    """Return the ``protein`` values found in the construct YAML files."""
    entry_names = set()
    self.logger.info('Getting construct accession codes')
    for source_file in os.listdir(self.construct_data_dir):
        source_file_path = os.sep.join([self.construct_data_dir, source_file])
        self.logger.info('Getting protein name from construct file {}'.format(source_file))

        # A name without any dot used to raise IndexError here, which the
        # caller's catch-all turned into an aborted run.
        split_filename = source_file.split(".")
        if len(split_filename) < 2 or split_filename[1] != 'yaml':
            continue

        with open(source_file_path, 'r') as f:
            # safe_load: yaml.load without an explicit Loader is unsafe and is
            # rejected by current PyYAML releases.
            sd = yaml.safe_load(f)

        # check whether protein is specified (also skips empty documents)
        if not isinstance(sd, dict) or 'protein' not in sd:
            continue
        entry_names.add(sd['protein'])
    return entry_names


def _find_template_protein(self, up, construct_entry_names):
    """Return ``(protein, is_ortholog)`` for a parsed UniProt entry.

    Tried in order: a human gene with one of the entry's gene names, a human
    protein whose entry name shares the prefix before ``_``, and - only for
    entries listed as constructs - the first BLAST hit. ``(None, False)`` is
    returned when nothing matches.
    """
    # get human ortholog using gene name
    for gene in up['genes']:
        try:
            g = Gene.objects.get(name__iexact=gene, species__common_name="Human", position=0)
            p = g.proteins.all().order_by('id')[0]
        except Gene.DoesNotExist:
            self.logger.info("No gene found for {}".format(gene))
            continue
        self.logger.info("Human ortholog found: {}".format(p.entry_name))
        return p, True

    # if gene name not found, try using entry name;
    # add _ to the split entry name to avoid e.g. gp1 matching gp139
    entry_name_query = up['entry_name'].split('_')[0] + '_'
    try:
        p = Protein.objects.get(entry_name__startswith=entry_name_query,
                                species__common_name="Human")
    except Protein.DoesNotExist:
        self.logger.info("No match found for {}".format(entry_name_query))
    else:
        self.logger.info("Human ortholog found: {}".format(p.entry_name))
        return p, True

    # check whether the entry name is in the construct list
    if up['entry_name'] in construct_entry_names:
        # BLAST sequence to find closest hit (for reference positions)
        blast_out = BlastSearch().run(up['sequence'])
        # use first hit from BLAST as template for reference positions
        try:
            return Protein.objects.get(pk=blast_out[0][0]), False
        except (Protein.DoesNotExist, IndexError):
            # IndexError: BLAST returned no hit at all
            print('Template protein for {} not found'.format(up['entry_name']))
            self.logger.error('Template protein for {} not found'.format(up['entry_name']))

    return None, False


def _get_or_create_gene_family(self, p, up):
    """Return the family named after the entry's first gene.

    The family lives under ``<top level family of p> / Other / Unclassified``;
    missing levels are created with sequentially numbered slugs.
    """
    top_level_parent_family = ProteinFamily.objects.get(slug=p.family.slug.split('_')[0])
    num_families = ProteinFamily.objects.filter(parent=top_level_parent_family).count()
    family_slug = top_level_parent_family.slug + "_" + str(num_families + 1).zfill(3)
    other_family, created = ProteinFamily.objects.get_or_create(
        parent=top_level_parent_family, name='Other', defaults={'slug': family_slug})
    if created:
        self.logger.info('Created protein family {}'.format(other_family))

    family_slug += '_001'
    unclassified_family, created = ProteinFamily.objects.get_or_create(
        parent=other_family, name='Unclassified', defaults={'slug': family_slug})
    if created:
        self.logger.info('Created protein family {}'.format(unclassified_family))

    num_families = ProteinFamily.objects.filter(parent=unclassified_family).count()
    family_slug = unclassified_family.slug + "_" + str(num_families + 1).zfill(3)
    pf, created = ProteinFamily.objects.get_or_create(
        parent=unclassified_family, name=up['genes'][0], defaults={'slug': family_slug})
    if created:
        self.logger.info('Created protein family {}'.format(pf))
    return pf


def main_func(self, positions, iteration, count, lock):
    """Create database entries for the non-human proteins in the UniProt dir.

    Args:
        positions: unused; kept for interface compatibility.
        iteration: 1 processes SWISSPROT entries, 2 processes TREMBL entries.
        count: shared counter (``.value``) holding the index of the next file.
        lock: lock guarding ``count``.

    Any exception is printed, logged and passed to ``PrintException`` instead
    of being propagated.
    """
    self.logger.info('CREATING OTHER PROTEINS')
    try:
        # go through constructs and find their entry_names for lookup
        construct_entry_names = self._collect_construct_entry_names()

        # parse files
        filenames = os.listdir(self.local_uniprot_dir)

        # Keep track of first or second iteration
        reviewed = ['SWISSPROT', 'TREMBL'][iteration - 1]
        skipped_due_to_swissprot = 0

        while True:
            # Test and claim the next index under the same lock; testing
            # outside of it let another holder of the counter exhaust the
            # list between the test and the read.
            with lock:
                if count.value >= len(filenames):
                    break
                source_file = filenames[count.value]
                count.value += 1

            split_filename = source_file.split(".")
            if len(split_filename) < 2 or split_filename[1] != 'txt':
                continue
            accession = split_filename[0]

            up = self.parse_uniprot_file(accession)

            # Skip TREMBL on first loop, and SWISSPROT on second
            if reviewed != up['source']:
                continue

            # skip human proteins (the literal here was corrupted, so the
            # comparison could never match)
            if 'species_latin_name' in up and up['species_latin_name'] == 'Homo sapiens':
                continue

            # should proteins that are not constructs be skipped?
            if self.constructs_only and up['entry_name'] not in construct_entry_names:
                continue

            # is there already an entry for this protein?
            try:
                Protein.objects.get(entry_name=up['entry_name'])
            except Protein.DoesNotExist:
                pass
            else:
                continue

            # p is the human ortholog, or a BLAST template for constructs
            p, ortholog = self._find_template_protein(up, construct_entry_names)

            # skip if no ortholog is found FIXME use a profile to find a good template
            if not p:
                continue

            # check whether an entry already exists for this protein/species
            # Skips unreviewed genes that have a matching SWISSPROT - some human
            # orthologues can have several orthologues from the same species.
            # Eg: agtra_rat and agtrb_rat for AGTR1_HUMAN
            already_entry_names = list(
                Protein.objects.filter(
                    family=p.family,
                    species__common_name=up['species_common_name'],
                    source__name="SWISSPROT",
                ).exclude(entry_name=up['entry_name']).values_list('entry_name', flat=True))
            if already_entry_names:
                if "SWISSPROT" != up['source']:
                    skipped_due_to_swissprot += 1
                    continue
                self.logger.error("{} {} swissprot orthologue already there? {}".format(
                    up['entry_name'], accession, already_entry_names))

            # create a database entry for the protein
            if ortholog:
                # for orthologs, use properties from the human protein
                self.create_protein(p.name, p.family, p.sequence_type,
                                    p.residue_numbering_scheme, accession, up)
            else:
                # otherwise, create a new family, and use Uniprot name
                pf = self._get_or_create_gene_family(p, up)
                self.create_protein(up['genes'][0], pf, p.sequence_type,
                                    p.residue_numbering_scheme, accession, up)

        self.logger.info('Skipped {} entries with an existing SWISSPROT counterpart'.format(
            skipped_due_to_swissprot))
        self.logger.info('COMPLETED CREATING OTHER PROTEINS')
    except Exception as msg:
        print(msg)
        self.logger.error(msg)
        PrintException()
class SequenceParser(object):
    """
    Class mapping the pdb, pdb_seqres, wildtype and any given sequence onto wt using blast with human sequences database.

    It produces a report with missing, mutated and inserted residues.
    """

    # residue names that are kept when reading a chain
    residue_list = [
        "ARG", "ASP", "GLU", "HIS", "ASN", "GLN", "LYS", "SER", "THR", "HIS",
        "HID", "PHE", "LEU", "ILE", "TYR", "TRP", "VAL", "MET", "PRO", "CYS",
        "ALA", "GLY"
    ]

    def __init__(self, pdb_file=None, sequence=None, wt_protein_id=None):
        """Read the pdb file, resolve the wild type and build the mapping.

        Args:
            pdb_file: name/path or open handle of the pdb file; may be None.
            sequence: optional sequence; a string is converted to an
                ``{index: residue}`` dictionary.
            wt_protein_id: id of the wild-type ``Protein``. When omitted (and
                a pdb file is given) the wild type is looked up through the
                structure id read from the SEQRES section.
        """
        # dictionary of 'ParsedResidue' object storing information about alignments and bw numbers
        self.mapping = {}
        self.residues = {}
        self.segments = {}
        self.blast = BlastSearch(blastdb=os.sep.join(
            [settings.STATICFILES_DIRS[0], 'blast', 'protwis_human_blastdb']))

        self.pdb_struct = None
        self.seqres = None
        self.struct_id = None
        if pdb_file is not None:
            self.pdb_struct = PDBParser(QUIET=True).get_structure('pdb', pdb_file)[0]
            # a list of SeqRecord objects retrieved from the pdb SEQRES section
            try:
                self.seqres = list(SeqIO.parse(pdb_file, 'pdb-seqres'))
            except Exception:
                self.seqres = None
            # SeqRecord id is a pdb_code:chain. Previously the first record
            # was indexed even when parsing had failed or found nothing.
            if self.seqres:
                self.struct_id = self.seqres[0].id.split(':')[0]

        # The original compared type(sequence) with the literal "string",
        # which is never true, and then referenced a misspelled name.
        if isinstance(sequence, str):
            self.sequence = dict(enumerate(sequence))
        else:
            self.sequence = sequence

        # If not specified, attempt to get wildtype from pdb.
        if not wt_protein_id and pdb_file is not None:
            self.wt = Structure.objects.get(
                pdb_code__index=self.struct_id).protein_conformation.protein.parent
        else:
            self.wt = Protein.objects.get(id=wt_protein_id)
        self.wt_seq = str(self.wt.sequence)
        self.fusions = []

        # Without a pdb file there is no structure to parse.
        if self.pdb_struct is not None:
            self.parse_pdb(self.pdb_struct)
        self.mark_deletions()

    def parse_pdb(self, pdb_struct):
        """
        extracting sequence and preparing dictionary of residues
        bio.pdb reads pdb in the following cascade: model->chain->residue->atom
        """
        wt_resi = list(Residue.objects.filter(protein_conformation__protein=self.wt.id))
        for chain in pdb_struct:
            # every chain starts from a fresh copy of the wild-type residues
            self.mapping[chain.id] = {
                x.sequence_number: ParsedResidue(
                    x.amino_acid,
                    x.sequence_number,
                    str(x.display_generic_number) if x.display_generic_number else None,
                    x.protein_segment)
                for x in wt_resi
            }
            # in bio.pdb the residue's id is a tuple of (hetatm flag, residue number, insertion code)
            self.residues[chain.id] = [
                res for res in chain
                if res.resname.replace('HID', 'HIS') in self.residue_list
            ]
            for peptide in self.get_chain_peptides(chain.id):
                self.map_to_wt_blast(chain.id, peptide, None, int(peptide[0].id[1]))

    def get_segments(self):
        """Return ``{segment slug: [min, max]}`` of mapped numbers in the first chain."""
        # get the first chain
        c = list(self.mapping.keys())[0]
        for segment in ProteinSegment.objects.all():
            resi = []
            for r in Residue.objects.filter(protein_conformation__protein=self.wt.id,
                                            protein_segment=segment):
                resnum = self.mapping[c][r.sequence_number].resnum
                if resnum is not None:
                    resi.append(resnum)
            if not resi:
                continue
            self.segments[segment.slug] = [min(resi), max(resi)]
        return self.segments

    def get_chain_peptides(self, chain_id, gap_threshold=230):
        """
        Get peptides of sequential residue numbers (with respect to 230 aa gaps).

        The maximum length of ICL3 is 230 aa, and fusion proteins usually have significantly different numbers, i.e.
        exceeding the 230 gap between TM5 and 6.

        The maximum allowed gap size can be evaluated automatically, but it is fairly costly:
        max([len(Residue.objects.filter(protein_segment=11, protein_conformation__protein=x)) for x in Protein.objects.filter(species=1)])
        """
        chain_residues = self.residues[chain_id]
        rnumbers = [int(x.id[1]) for x in chain_residues]
        last_idx = len(rnumbers) - 1

        peptides = []
        current = []
        for i, residue in enumerate(chain_residues):
            current.append(residue)
            # FIXME: Assuming that very last residue is actually continuation of a chain
            if i == last_idx:
                peptides.append(current)
                break
            following = rnumbers[i + 1]
            expected = rnumbers[i] + 1
            if following != expected and abs(expected - following) > gap_threshold:
                peptides.append(current)
                current = []
        return peptides

    def get_chain_sequence(self, chain):
        """
        Returns a sequence string of a given chain.
        """
        return self.get_peptide_sequence(self.residues[chain])

    def get_peptide_sequence(self, residues):
        """
        Returns a sequence string of a given list of Bio.PDB.Residue objects.
        """
        return "".join([
            polypeptide.three_to_one(x.resname.replace('HID', 'HIS'))
            for x in residues if x.resname in self.residue_list
        ])

    def find_nonredundant_chains(self):
        """
        Returns a list of nonidentical chains.

        The first chain of every group of chains with equal mappings is
        kept. The previous pairwise comparison appended a chain once per
        differing partner, so chains were reported several times.
        """
        unique_chains = []
        for chain in self.mapping:
            if any(self.mapping[chain] == self.mapping[kept] for kept in unique_chains):
                continue
            unique_chains.append(chain)
        return unique_chains

    def map_to_wt_blast(self, chain_id, residues=None, sequence=None, starting_aa=1, seqres=False):
        """Blast a peptide, a raw sequence or the whole chain and map the hits.

        Alignments whose first HSP has an expect value above 0.5 are recorded
        as auxiliary (fusion) proteins when ``residues`` is given; alignments
        to a protein other than the wild type are ignored.
        """
        if residues:
            seq = self.get_peptide_sequence(residues)
        elif sequence:
            seq = sequence
        else:
            seq = self.get_chain_sequence(chain_id)
        alignments = self.blast.run(seq)

        for alignment in alignments:
            if alignment[1].hsps[0].expect > .5 and residues:
                self.fusions.append(AuxProtein(residues))
                # The case when auxiliary protein is in a separate chain
                if self.get_chain_sequence(chain_id) == self.get_peptide_sequence(residues):
                    # pop: a second poor alignment must not fail on the
                    # already removed chain
                    self.mapping.pop(chain_id, None)
                continue
            if self.wt.id != int(alignment[0]):
                continue
            for hsps in alignment[1].hsps:
                self.map_hsps(hsps, chain_id, starting_aa, seqres)

    def map_hsps(self, hsps, chain_id, offset=1, seqres=False):
        """
        Analyzes the High Similarity Protein Segment.

        ``offset`` is the pdb number of the first query residue. Each counter
        advances only when its own sequence contributes a residue; advancing
        both on a gap shifted every position after the gap.
        """
        sbjct_counter = hsps.sbjct_start
        q_counter = hsps.query_start
        chain_map = self.mapping[chain_id]

        for s, q in zip(hsps.sbjct, hsps.query):
            if s == q:
                if seqres:
                    chain_map[sbjct_counter].set_seqres(True)
                else:
                    chain_map[sbjct_counter].set_pdb_res_num(offset - 1 + q_counter)
                sbjct_counter += 1
                q_counter += 1
            elif s != '-' and q != '-':
                chain_map[sbjct_counter].set_pdb_res_num(offset - 1 + q_counter)
                chain_map[sbjct_counter].set_mutation(q)
                sbjct_counter += 1
                q_counter += 1
            elif s == '-' and q != '-':
                # NOTE(review): the insertion is stored under the pdb number
                # although the mapping is keyed by wild-type numbers - confirm.
                chain_map[offset - 1 + q_counter].set_insertion(q)
                q_counter += 1
            elif s != '-' and q == '-':
                chain_map[sbjct_counter].set_deletion()
                sbjct_counter += 1

    def map_to_wt_pw(self, chain_id, residues=None, sequence=None, starting_aa=1, seqres=False):
        """
        @param sequence: a dictionary of residue number: residue one letter code pairs
        @param seqres: flag matched residues as present in SEQRES (new keyword;
            the body used this name without defining it)

        NOTE(review): this method still relies on ``self.wt_seq_start``, which
        is never assigned in this class, and on ``add_*`` methods where the
        rest of the class calls ``set_*`` - verify before use.
        """
        if residues:
            # residues is a list of residues, not a chain id
            seq = self.get_peptide_sequence(residues)
        elif sequence:
            # a dict view cannot be indexed by the aligner
            seq = list(sequence.values())
        else:
            return

        wt, chain_seq, score, start, end = pairwise2.align.localms(
            self.wt_seq, seq, 2, -4, -4, -.1, one_alignment_only=True)[0]

        offset = 0
        for w, c in zip(wt, chain_seq):
            if w == c:
                if seqres:
                    # was chain.id, an undefined name
                    self.mapping[chain_id][starting_aa + offset].seqres = True
                r = Residue.objects.get(sequence_number=offset + self.wt_seq_start,
                                        protein_conformation__protein=self.wt.id)
                if r.display_generic_number is not None:
                    self.mapping[chain_id][starting_aa + offset].add_gpcrdb(r.display_generic_number)
                offset += 1
            elif c == '-' and w != '-':
                self.mapping[chain_id][starting_aa + offset].add_deletion()
            elif w != '-' and c != '-' and w != c:
                self.mapping[chain_id][starting_aa + offset].add_mutation(c)
                offset += 1
            elif w == '-' and c != '-':
                self.mapping[chain_id][starting_aa + offset].add_insertion(c)
                offset += 1

    def map_seqres(self):
        """Map every SEQRES record onto the wild type."""
        for sr in self.seqres:
            self.map_to_wt_blast(sr.annotations['chain'], sequence=sr.seq, seqres=True)

    def mark_deletions(self):
        """Flag every residue without a pdb number as deleted."""
        for chain_map in self.mapping.values():
            for res in chain_map.values():
                if res.resnum is None:
                    res.set_deletion()

    def get_mapping_dict(self, pdb_keys=False, seqres=False):
        """Return ``{chain: {wt number: value}}``.

        The value is the SEQRES flag when both ``pdb_keys`` and ``seqres``
        are set, the pdb number or ``'-'`` (residue not in SEQRES) when only
        ``seqres`` is set, and the pdb number otherwise.
        """
        if pdb_keys:
            def value_of(res):
                return res.seqres if seqres else res.resnum
        elif seqres:
            def value_of(res):
                return res.resnum if res.seqres else '-'
        else:
            def value_of(res):
                return res.resnum

        return {
            chain: {num: value_of(res) for num, res in chain_map.items()}
            for chain, chain_map in self.mapping.items()
        }

    def get_fusions(self):
        """Return ``{"auxiliary": {"aux<N>": info}}`` or ``{}`` without fusions."""
        if not self.fusions:
            return {}
        fusion_dict = OrderedDict({"auxiliary": {}})
        # The counter was never incremented, so every fusion overwrote "aux1".
        for count, fusion in enumerate(self.fusions, start=1):
            fusion_dict["auxiliary"]["aux{}".format(count)] = fusion.get_info()
        return fusion_dict

    @staticmethod
    def _deletion_record(positions, chain):
        """Describe one run of consecutive deleted positions."""
        return OrderedDict({
            "start": min(positions),
            "end": max(positions),
            "type": "single" if len(positions) == 1 else "range",
            "chain": chain
        })

    def get_deletions(self):
        """Return ``{"deletions": [...]}`` with one record per consecutive run.

        Positions are sorted and split wherever two neighbours are not
        adjacent. Chains without deletions contribute nothing; they used to
        raise ValueError, and the first two deletions of a chain were always
        merged into one run.
        """
        deletions_list = []
        for chain in self.find_nonredundant_chains():
            positions = sorted(num for num, res in self.mapping[chain].items() if res.deletion)
            run = []
            for pos in positions:
                if run and pos - run[-1] != 1:
                    deletions_list.append(self._deletion_record(run, chain))
                    run = []
                run.append(pos)
            if run:
                deletions_list.append(self._deletion_record(run, chain))
        return {"deletions": deletions_list}

    def get_mutations(self):
        """Return ``{"mutations": [...]}`` with one record per mutated residue."""
        mutations_list = []
        for chain in self.find_nonredundant_chains():
            for num, res in self.mapping[chain].items():
                if not res.mutation:
                    continue
                mutations_list.append(
                    OrderedDict({
                        "wt": res.name,
                        "mut": res.mutation,
                        "pos (wt)": num,
                        "pos (pdb)": res.resnum,
                        "chain": chain
                    }))
        return {"mutations": mutations_list}

    def get_report(self):
        """Print every mapped residue, chain by chain, in sorted order."""
        for chain in sorted(self.mapping.keys()):
            print("Chain {}".format(chain))
            for res in sorted(self.mapping[chain].keys()):
                print(self.mapping[chain][res])

    def save_excel_report(self, file_name):
        """Write one worksheet per chain to the workbook ``file_name``."""
        workbook = xlsxwriter.Workbook(file_name)
        try:
            for chain in sorted(self.mapping.keys()):
                worksheet = workbook.add_worksheet(chain)
                worksheet.write_row(0, 0, [
                    "Protein number", "Residue name", "PDB number",
                    "Generic number", "Mutation", "SEQRES"
                ])
                for row_id, res in enumerate(sorted(self.mapping[chain].keys()), start=1):
                    worksheet.write_row(row_id, 0, self.mapping[chain][res].get_param_list())
        finally:
            # close the workbook even if a row could not be written
            workbook.close()
def handle(self, *args, **options):
    """Compare Ensembl/GTEx transcript isoforms of human proteins with the LMB annotation.

    Steps, in order:

    1. Read the LMB isoform table (``protein/data/Isoform_annotation_table.txt``) into
       ``lmb_data`` (entry name -> list of transcript ids).
    2. Load ``protein/data/matched_gtex.json`` and re-key its ``transcripts`` mapping so
       that transcript ids carry no version suffix.
    3. Write every LMB protein and transcript sequence to
       ``protein/data/20190726_transcripts.fa`` and compare the fetched sequences with
       the per-protein FASTA files in ``protein/data/LMB_sequences/``.
    4. For every selected protein, look up the gene id, fetch its transcripts, filter
       them on GTEx data, and collect the remaining ones as isoforms.
    5. Print summary counters and dump ``isoforms`` to
       ``protein/data/all_isoforms_gtex.json``.

    Side effects: prints progress to stdout, performs web requests through
    ``fetch_from_web_api``, overwrites the FASTA and JSON output files, and appends to
    ``20190726_skipped_due_to_gtex.txt`` and
    ``20190726_new_transcripts_for_consideration.txt``.

    ``args`` and ``options`` are not read.
    """
    ## Prepare comparison info ##
    filepath = 'protein/data/Isoform_annotation_table.txt'
    lmb_data = OrderedDict()
    total_lmb_isoforms = 0
    all_lmb_isoforms = []
    with open(filepath, "r", encoding='UTF-8') as f:
        for i,row in enumerate(f):
            # row 0 is skipped (presumably a header line - confirm against the data file)
            if i>0:
                c = row.split("\t")
                # column 1 is lower-cased into an entry name, column 4 holds ", "-separated transcript ids
                entry_name = "{}_human".format(c[1].lower())
                transcripts = c[4].split(", ")
                if not entry_name in lmb_data:
                    lmb_data[entry_name] = []
                lmb_data[entry_name] += transcripts
                total_lmb_isoforms += 1
                all_lmb_isoforms += transcripts
    print('all_lmb_isoforms',len(all_lmb_isoforms),'distinct',len(set(all_lmb_isoforms)))

    ## Get parsed gtex annotation
    with open('protein/data/matched_gtex.json') as json_file:
        gtex_old = json.load(json_file)

    ## Need to rewrite these entries, as ensembl doesnt use the . for transcripts
    # keys are "<transcript>_<gene>"; only the transcript part loses its ".N" suffix
    gtex = {}
    for key, val in gtex_old['transcripts'].items():
        t,g = key.split("_")
        new_key = "{}_{}".format(t.split(".")[0],g)
        gtex[new_key] = val
        # del gtex[new_key]['subjects']

    ## Url API to map genename to ensemble ID
    cache_dir_genes = ['gtexportal', 'gene_lookup']
    url_gene = 'https://gtexportal.org/rest/v1/reference/gene?geneId=$index&gencodeVersion=v19&genomeBuild=GRCh37%2Fhg19&pageSize=250&format=json'

    ## Url to lookup ensemble ID to find transcripts
    # NOTE(review): cache_dir_transcripts_gtex / url_transcripts are not used below
    cache_dir_transcripts_gtex = ['gtexportal', 'transcripts']
    url_transcripts = 'https://gtexportal.org/rest/v1/reference/transcript?gencodeId=$index&gencodeVersion=v19&genomeBuild=GRCh37%2Fhg19'

    cache_dir_transcripts = ['ensembl37', 'transcripts']
    url_ensembl = 'https://grch37.rest.ensembl.org/lookup/id/$index?expand=1;content-type=application/json'

    # NOTE(review): only referenced from commented-out code below
    cache_dir_gtex_expression = ['gtexportal', 'expression_data']
    url_expression = 'https://gtexportal.org/rest/v1/expression/medianTranscriptExpression?datasetId=gtex_v7&gencodeId=$index&format=json'

    ## Url to lookup sequence of transcript
    cache_dir_seq = ['ensembl37', 'seq_protein']
    url_ensembl_seq = 'https://grch37.rest.ensembl.org/sequence/id/$index?content-type=application/json;type=protein'

    # Get all human GPCRs
    ps = Protein.objects.filter(sequence_type__slug='wt', species__common_name="Human", family__slug__startswith='00').all().prefetch_related('genes').order_by('entry_name')

    # Result container and run-wide counters
    isoforms = {}
    total_transcripts = 0
    total_transcript_skipped_no_tissue=0
    total_proteins_with_isoforms = 0
    gene_to_ensembl = {}
    transcripts_ids_total = set()
    transcripts_ids_skipped_total = set()
    total_fetched_transcripts = 0
    canonical_disagreement_count = 0
    total_new_transcripts = []
    total_not_found = []
    total_not_found_due_to_skipped = []
    new_proteins = set()
    lmb_compare_sequences = [0,0,0] # correct, wrong, not exists in lmb
    # sequence -> first transcript id seen with it (distinct from `sequences_lookup` below)
    sequence_lookup = {}

    ## COMPARE SEQUENCES
    filenames = os.listdir("protein/data/LMB_sequences/")
    all_lmb_sequences= {}
    for f in filenames:
        with open ("protein/data/LMB_sequences/"+f, "r") as myfile:
            fasta=myfile.read().splitlines()
            for i,l in enumerate(fasta):
                # header lines set the current id; the first two characters are dropped
                # NOTE(review): an empty line would raise IndexError on l[0]
                if l[0]==">":
                    e_id = l[2:]
                    continue
                if e_id in all_lmb_sequences:
                    print('already there!',e_id)
                # lines with index <= 2 are not stored
                # (presumably the first record is the reference protein - confirm)
                if i>2:
                    all_lmb_sequences[e_id]=l
    print('all_lmb_sequences',len(all_lmb_sequences))

    # NOTE: `f` is rebound from the loop variable above to an output file handle
    f = open("protein/data/20190726_transcripts.fa", "w")
    missing_sequences = 0
    total_lmb_sequences = 0
    # sequence -> list of [id, entry_name] pairs sharing that sequence
    sequences_lookup = defaultdict(list)
    for p,ts in lmb_data.items():
        seq = Protein.objects.get(entry_name=p).sequence
        sequences_lookup[seq].append([p,p])
        # print(p,ts)
        # print(seq)
        f.write(">{} GPCRdb sequence reference\n".format(p))
        f.write("{}\n".format(seq))

        seq_filename = "protein/data/LMB_sequences/{}_nonstrict_transcripts.fa".format(p)
        lmb_sequences = {}
        try:
            with open (seq_filename, "r") as myfile:
                #fasta_raw = myfile.read()
                fasta=myfile.read().splitlines()
                for i,l in enumerate(fasta):
                    if l[0]==">":
                        e_id = l[2:]
                        continue
                    lmb_sequences[e_id]=l
                    if i>2:
                        total_lmb_sequences += 1
        # NOTE(review): bare except - any error while reading counts all transcripts as missing
        except:
            #print('No file for',p,' So no sequence for',ts)
            missing_sequences += len(ts)

        for t in ts:
            if not t in lmb_sequences:
                #print('missing ',t,'in',"{}_nonstrict_transcripts.fa".format(p))
                missing_sequences += 1
            seq = fetch_from_web_api(url_ensembl_seq, t,cache_dir_seq)['seq']
            sequences_lookup[seq].append([t,p])
            if t in lmb_sequences:
                if seq!=lmb_sequences[t]:
                    print(t,'different from LBM - length ensembl:',len(seq),"length lmb:",len(lmb_sequences[t]))
            f.write(">{} ({})\n".format(t,p))
            f.write("{}\n".format(seq))
    f.close()

    print('total missing sequences',missing_sequences)
    print('total lmb transcript sequences provided',total_lmb_sequences)
    print('total lmb protein',len(lmb_data))
    #return
    for seq,ts in sequences_lookup.items():
        if len(ts)>1:
            print('Identical sequence:',ts)

    # start over: from here on the lookup only holds transcripts found in this search
    sequences_lookup = defaultdict(list)
    all_transcript_seq = {}
    for p in ps:# .filter(entry_name='gpc5b_human').all():
        transcripts = []
        transcripts_ids = []
        transcripts_ids_skipped = []
        ensembl_transcripts_count = 0
        genes = list(p.genes.all().values_list('name',flat=True))
        uniprot = p.accession
        canonical = ''
        canon_seq = p.sequence
        # sequence_lookup[canon_seq] = p.entry_name
        grch37_canonical_seq = ''
        uniprot_canonical = ''
        grch37_canonical = ''
        # print(">" + p.entry_name,uniprot, 'genes:',genes)

        # per-protein LMB sequences; a missing or unreadable file leaves the dict empty
        seq_filename = "protein/data/LMB_sequences/{}_nonstrict_transcripts.fa".format(p.entry_name)
        lmb_sequences = {}
        try:
            with open (seq_filename, "r") as myfile:
                #fasta_raw = myfile.read()
                fasta=myfile.read().splitlines()
                for l in fasta:
                    if l[0]==">":
                        e_id = l[2:]
                        continue
                    lmb_sequences[e_id]=l
        # NOTE(review): bare except silently ignores every error, not only a missing file
        except:
            pass
        #break

        alternative_ids_uniprot = self.find_ensembl_id_by_uniprot(uniprot)
        # print(alternative_ids_uniprot)

        # collect the gencode ids of all lookup hits whose symbol equals one of the gene names
        ensembl_gene_id = []
        for gene in genes:
            if not gene:
                continue
            gene_lookup = fetch_from_web_api(url_gene, gene, cache_dir_genes)
            # try:
            same_gene_id = ''
            if gene_lookup['gene']:
                for gene_info in gene_lookup['gene']:
                    if gene_info['geneSymbol']==gene:
                        ensembl_gene_id.append(gene_info['gencodeId'])

        if len(ensembl_gene_id)>1:
            print(ensembl_gene_id,'MORE THAN 1 !!!!')

        # from here on `ensembl_gene_id` is a single id (string) instead of a list
        if len(ensembl_gene_id)==0:
            print('No ID found, using uniprot')
            if alternative_ids_uniprot['genes']:
                ensembl_gene_id = alternative_ids_uniprot['genes'][0]
            else:
                print("NO ID FOR THIS RECEPTOR")
                continue
        else:
            ensembl_gene_id = ensembl_gene_id[0]

        #alternative_id = self.find_ensembl_id(gene)
        # alternative_id_uniprot = self.find_ensembl_id_by_uniprot(uniprot)
        # print(ensembl_gene_id,alternative_ids_uniprot)
        # expression = fetch_from_web_api(url_expression,ensembl_gene_id,cache_dir_gtex_expression)
        # print(expression)
        # go through expression
        # expressed_transcripts = {}
        # for e in expression['medianTranscriptExpression']:
        #     if e['median']>0 or 1==1:
        #         #only if expression
        #         t_id = e['transcriptId']
        #         t_short = t_id.split(".")[0]
        #         tissue = e['tissueSiteDetailId']
        #         if t_short not in expressed_transcripts:
        #             expressed_transcripts[t_short] = {'long':t_id,'tissues':[], 'max_median':0}
        #         if expressed_transcripts[t_short]['max_median']<e['median']:
        #             expressed_transcripts[t_short]['max_median'] = e['median']
        #         expressed_transcripts[t_short]['tissues'].append([tissue,e['median']])
        # print(expressed_transcripts)
        # print(ensembl_gene_id)
        gene_to_ensembl[p.entry_name] = ensembl_gene_id
        # print("E_ID: " +ensembl_gene_id,alternative_ids_uniprot)
        # ensembl_transcripts = fetch_from_web_api(url_ensembl, ensembl_gene_id, cache_dir_transcripts)
        # use uniprot gene ID instead
        ensembl_transcripts = fetch_from_web_api(url_ensembl, ensembl_gene_id, cache_dir_transcripts)
        # print(ensembl_gene_id)
        if (alternative_ids_uniprot['genes'] and ensembl_gene_id.split(".")[0]!=alternative_ids_uniprot['genes'][0]):
            print("##### ensembl gene id changed",ensembl_gene_id,alternative_ids_uniprot['genes'][0])
        #total_fetched_transcripts += len(ensembl_transcripts['Transcript'])
        # print(ensembl_transcripts)
        same_gene_id = True
        # falsy lookup result: retry with the first gene id from the UniProt lookup
        # NOTE(review): raises IndexError when alternative_ids_uniprot['genes'] is empty
        if not ensembl_transcripts:
            print('error',alternative_ids_uniprot,ensembl_gene_id)
            same_gene_id = False
            ensembl_transcripts = fetch_from_web_api(url_ensembl, alternative_ids_uniprot['genes'][0], cache_dir_transcripts)

        for t in ensembl_transcripts['Transcript']:
            ensembl_transcripts_count += 1
            display_name = t['display_name']
            is_canonical = t['is_canonical']
            biotype = t['biotype']
            t_id = t['id']
            # # Skip canonical entries
            # continue
            # Only interested in protein_coding
            if biotype=='protein_coding':
                total_fetched_transcripts += 1
                key = '{}_{}'.format(t_id,ensembl_gene_id)
                # skip transcripts without a GTEx entry
                if not key in gtex:
                    # print('t_id', t_id, 'not in expressed_transcripts')
                    total_transcript_skipped_no_tissue += 1
                    transcripts_ids_skipped_total.add(t_id)
                    transcripts_ids_skipped.append(t_id)
                    continue
                # skip transcripts whose GTEx "count" is below 3
                if gtex[key]["count"]<3:
                    total_transcript_skipped_no_tissue += 1
                    transcripts_ids_skipped_total.add(t_id)
                    transcripts_ids_skipped.append(t_id)
                    continue
                length = t['Translation']['length']
                seq_id = t['Translation']['id']
                transcript_info = OrderedDict([('display_name',display_name),('t_id',t_id),('length',length), ('seq_id',seq_id), ('expressed',gtex[key])])
                seq = fetch_from_web_api(url_ensembl_seq, seq_id,cache_dir_seq)
                if is_canonical:
                    grch37_canonical = t_id
                    transcript_info['grch37_canonical'] = True
                    grch37_canonical_seq = seq['seq']
                # a transcript identical to the stored protein sequence is not an isoform
                if seq['seq']==canon_seq:
                    uniprot_canonical = t_id
                    transcript_info['uniprot_canonical'] = True
                    continue # Skip canonical entries
                sequences_lookup[seq['seq']].append([t_id,p.entry_name])
                all_transcript_seq[t_id] = seq['seq']
                # keep only the first transcript seen for any given sequence (across proteins)
                if seq['seq'] in sequence_lookup:
                    print('SEQUENCE ALREADY SEEN',t_id, sequence_lookup[seq['seq']])
                    continue
                sequence_lookup[seq['seq']] = t_id
                transcript_info['seq'] = seq['seq']
                # 'lmb_sequences' is False (absent), True (identical) or the differing LMB sequence
                if not t_id in lmb_sequences:
                    transcript_info['lmb_sequences'] = False
                    lmb_compare_sequences[2] += 1
                else:
                    if lmb_sequences[t_id]==seq['seq']:
                        transcript_info['lmb_sequences'] = True
                        lmb_compare_sequences[0] += 1
                    else:
                        transcript_info['lmb_sequences'] = lmb_sequences[t_id]
                        lmb_compare_sequences[1] += 1
                if t_id in alternative_ids_uniprot['transcripts']:
                    transcript_info['in_uniprot'] = True
                else:
                    transcript_info['in_uniprot'] = False
                if p.entry_name in lmb_data and t_id in lmb_data[p.entry_name]:
                    transcript_info['in_lmb'] = True
                else:
                    transcript_info['in_lmb'] = False
                if t_id not in transcripts_ids:
                    transcripts.append(transcript_info)
                    transcripts_ids.append(t_id)
                    transcripts_ids_total.add(t_id)
                    total_transcripts += 1
        # except:
        #     print('Error fetching ensemble_gene_id for gene',gene)
        #     pass

        # LMB transcripts of this protein that were not kept: either filtered out on
        # GTEx data (reason is logged to a file) or not seen at all
        not_found = []
        not_found_due_to_skipped = []
        if p.entry_name in lmb_data:
            for t in lmb_data[p.entry_name]:
                if t not in transcripts_ids:
                    if t in transcripts_ids_skipped:
                        f = open("protein/data/20190726_skipped_due_to_gtex.txt", "a")
                        not_found_due_to_skipped.append(t)
                        key = '{}_{}'.format(t,ensembl_gene_id)
                        if not key in gtex:
                            reason = 'Not in GTEX'
                        else:
                            reason = 'Subjects low in GTEX - count is {} - subject ids {}'.format(gtex[key]['count'],", ".join(gtex[key]['subjects']))
                        f.write("{}: {}\n".format(t,reason))
                        f.close()
                        # print(t)
                    else:
                        not_found.append(t)
        total_not_found += not_found
        total_not_found_due_to_skipped += not_found_due_to_skipped

        # kept transcripts that LMB does not list for this protein
        new = []
        for t in transcripts_ids:
            if p.entry_name in lmb_data and t in lmb_data[p.entry_name]:
                pass
            else:
                ts_check = sequences_lookup[all_transcript_seq[t]]
                # NOTE(review): ts_check items are [id, entry_name] lists while lmb_data
                # values hold id strings, so this test looks like it can never succeed;
                # the `continue` also only affects this inner loop - confirm intent
                for t_check in ts_check:
                    if p.entry_name in lmb_data and t_check in lmb_data[p.entry_name]:
                        print('found via duplicate',t_check,t)
                        continue
                key = '{}_{}'.format(t,ensembl_gene_id)
                #blast = BlastSearch(top_results=2)
                blast = BlastSearch(blastdb=os.sep.join([settings.STATICFILES_DIRS[0], 'blast', 'protwis_human_blastdb']), top_results=2)
                blast_out = blast.run(all_transcript_seq[t])
                result = [(Protein.objects.get(pk=x[0]).entry_name, x[1].hsps[0].expect) for x in blast_out]
                #print(result)
                # accept only when the best hit is this protein with an expect value below 0.05
                if result:
                    if result[0][0]==p.entry_name and result[0][1]<0.05:
                        f = open("protein/data/20190726_new_transcripts_for_consideration.txt", "a")
                        reason = 'GTEX count: {}'.format(gtex[key]['count'])
                        f.write(">{} ({}): {}\n".format(t,p.entry_name,reason))
                        f.write("{}\n".format(all_transcript_seq[t]))
                        f.close()
                        new.append(t)
                        if p.entry_name in lmb_data:
                            new_proteins.add(p.entry_name)
                    else:
                        print('bad blast match',result)
                else:
                    print('bad blast match',result)
        total_new_transcripts += new
        # print(len(alternative_ids_uniprot['transcripts']), 'uniprot transcripts found',ensembl_transcripts_count, ' ensembl transcripts found',len(transcripts), 'transcripts kept after filtering')

        # Add if transcripts found
        # (the same record is stored in both branches; only the counters differ)
        if len(transcripts):
            isoforms[p.entry_name] = {'ensembl_gene_id':ensembl_gene_id,'same_gene_id':same_gene_id,'canonical_seq':canon_seq, 'grch37_canonical_seq':grch37_canonical_seq, 'isoforms': transcripts, 'uniprot_lookup': alternative_ids_uniprot, 'lmb_not_found':not_found, 'lmb_not_found_due_to_skipped': not_found_due_to_skipped, 'new_transcripts_than_lmb': new,'skipped_due_to_gtex': transcripts_ids_skipped, 'grch37_canonical':grch37_canonical, 'uniprot_canonical':uniprot_canonical}
            if grch37_canonical_seq!=canon_seq:
                isoforms[p.entry_name]['canonical_disagreement'] = True
                canonical_disagreement_count += 1
            # isoforms[p.entry_name].append(alternative_ids_uniprot)
            # isoforms[p.entry_name].append(not_found)
            total_proteins_with_isoforms += 1
        else:
            isoforms[p.entry_name] = {'ensembl_gene_id':ensembl_gene_id,'same_gene_id':same_gene_id,'canonical_seq':canon_seq, 'grch37_canonical_seq':grch37_canonical_seq, 'isoforms': transcripts, 'uniprot_lookup': alternative_ids_uniprot, 'lmb_not_found':not_found, 'lmb_not_found_due_to_skipped': not_found_due_to_skipped, 'new_transcripts_than_lmb': new,'skipped_due_to_gtex': transcripts_ids_skipped, 'grch37_canonical':grch37_canonical, 'uniprot_canonical':uniprot_canonical}
        # break
        # intermediate dump of everything collected so far
        # NOTE(review): this handle is never closed explicitly
        f = open('protein/data/all_isoforms_gtex.json', 'w')
        json.dump(isoforms,f, indent=4, separators=(',', ': '))
        #break

    for seq,ts in sequences_lookup.items():
        if len(ts)>1:
            print('identical sequence',ts)

    # NOTE(review): all_transcript_seq is only filled for transcripts that reached the
    # sequence step above, so this lookup may raise KeyError for ids in total_not_found - confirm
    for t in total_not_found:
        ts_check = sequences_lookup[all_transcript_seq[t]]
        found = False
        for t_check in ts_check:
            if t_check[0] not in total_not_found:
                print(t,'found but under another id',t_check[0])
                found = True
        if not found:
            print('##',t,'in LMB but not in this search')

    # print small summary results
    print('total_proteins_searched',len(ps))
    print('total_proteins_with_isoforms', total_proteins_with_isoforms)
    print('Total transcripts deemed to be isoforms',total_transcripts)
    print('Amount of these not in LMB data',len(total_new_transcripts))
    print(new_proteins)
    # print('Amount in LBM not found',len(total_not_found))
    # print(total_not_found)
    print('Amount in LBM found but skipped due to GTEX data',len(total_not_found_due_to_skipped))
    print(total_not_found_due_to_skipped)
    print('Sequence compare to LMB', lmb_compare_sequences)
    print('canonical_disagreement_count',canonical_disagreement_count)
    print(total_not_found)
    # print('total_transcript_skipped_no_tissue',total_transcript_skipped_no_tissue)
    # print('total_transcript_skipped_no_tissue2 ',len(transcripts_ids_skipped_total))
    # print('total_fetched_transcripts',total_fetched_transcripts)
    # print(gene_to_ensembl)

    # save to file
    # NOTE(review): this handle is never closed explicitly
    f = open('protein/data/all_isoforms_gtex.json', 'w')
    json.dump(isoforms,f, indent=4, separators=(',', ': '))
def create_orthologs(self, constructs_only):
    """Create database entries for the non-human proteins in the local UniProt directory.

    Every ``<accession>.txt`` file in ``self.local_uniprot_dir`` is parsed with
    ``self.parse_uniprot_file``. Human proteins and proteins that already have a
    database entry are skipped. For the remaining ones a template protein is looked
    up (by gene name, then by entry-name prefix, and - for constructs only - by
    BLAST); when neither a curated nor an automatic reference position file exists,
    one is generated from the template; finally ``self.create_protein`` is called.

    Args:
        constructs_only: when true, only proteins whose entry name is referenced by
            a construct YAML file in ``self.construct_data_dir`` are processed.
    """
    self.logger.info('CREATING OTHER PROTEINS')

    # go through constructs and find their entry_names for lookup
    construct_entry_names = []
    self.logger.info('Getting construct accession codes')
    for source_file in os.listdir(self.construct_data_dir):
        source_file_path = os.sep.join([self.construct_data_dir, source_file])
        self.logger.info('Getting protein name from construct file {}'.format(source_file))

        # the extension is the second dot-separated part; names without a dot are skipped
        split_filename = source_file.split(".")
        if len(split_filename) < 2 or split_filename[1] != 'yaml':
            continue

        # read the yaml file
        # NOTE(review): yaml.load without an explicit Loader can construct arbitrary
        # objects and is rejected by recent PyYAML releases - consider yaml.safe_load
        with open(source_file_path, 'r') as f:
            sd = yaml.load(f)

        # check whether protein is specified (an empty document loads as None)
        if not isinstance(sd, dict) or 'protein' not in sd:
            continue

        # append entry_name to lookup list
        construct_entry_names.append(sd['protein'])

    # parse files
    for source_file in os.listdir(self.local_uniprot_dir):
        split_filename = source_file.split(".")
        if len(split_filename) < 2 or split_filename[1] != 'txt':
            continue
        accession = split_filename[0]

        up = self.parse_uniprot_file(accession)

        # skip human proteins
        if 'species_latin_name' in up and up['species_latin_name'] == 'Homo sapiens':
            continue

        # should proteins that are not constructs be skipped?
        if constructs_only and up['entry_name'] not in construct_entry_names:
            continue

        # is there already an entry for this protein?
        try:
            Protein.objects.get(entry_name=up['entry_name'])
        except Protein.DoesNotExist:
            pass
        else:
            continue

        # is this an ortholog of a human protein?
        ortholog = False
        p = None

        # get human ortholog using gene name
        for gene in up['genes']:
            try:
                g = Gene.objects.get(name__iexact=gene, species__id=1, position=0)
            except Gene.DoesNotExist:
                self.logger.info("No gene found for {}".format(gene))
                continue
            p = g.proteins.all().order_by('id')[0]
            ortholog = True
            self.logger.info("Human ortholog found: {}".format(p.entry_name))
            break

        # if gene name not found, try using entry name
        if not p:
            # add _ to the split entry name to avoid e.g. gp1 matching gp139
            entry_name_query = up['entry_name'].split('_')[0] + '_'
            try:
                p = Protein.objects.get(entry_name__startswith=entry_name_query, species__id=1)
                ortholog = True
                self.logger.info("Human ortholog found: {}".format(p.entry_name))
            except Protein.DoesNotExist:
                self.logger.info("No match found for {}".format(entry_name_query))

        # check whether the entry name is in the construct list
        if not p and up['entry_name'] in construct_entry_names:
            # BLAST sequence to find closest hit (for reference positions)
            blast_out = BlastSearch().run(up['sequence'])

            # use first hit from BLAST as template for reference positions;
            # without any hit there is no template and the protein is skipped below
            if blast_out:
                try:
                    p = Protein.objects.get(pk=blast_out[0][0])
                except Protein.DoesNotExist:
                    self.logger.error('Template protein for {} not found'.format(up['entry_name']))
            else:
                self.logger.error('Template protein for {} not found'.format(up['entry_name']))

        # skip if no ortholog is found FIXME use a profile to find a good template
        if not p:
            continue

        # check whether reference positions exist for this protein, and find them if they do not
        ref_position_file_path = os.sep.join(
            [self.ref_position_source_dir, up['entry_name'] + '.yaml'])
        auto_ref_position_file_path = os.sep.join(
            [self.auto_ref_position_source_dir, up['entry_name'] + '.yaml'])
        if (not os.path.isfile(ref_position_file_path)
                and not os.path.isfile(auto_ref_position_file_path)):
            # get reference positions of human ortholog
            template_ref_position_file_path = os.sep.join(
                [self.ref_position_source_dir, p.entry_name + '.yaml'])
            if not os.path.isfile(template_ref_position_file_path):
                # use a non human sequence
                template_ref_position_file_path = os.sep.join(
                    [self.auto_ref_position_source_dir, p.entry_name + '.yaml'])

            ref_positions = align_protein_to_reference(up, template_ref_position_file_path, p)

            # write reference positions to a file
            with open(auto_ref_position_file_path, "w") as auto_ref_position_file:
                yaml.dump(ref_positions, auto_ref_position_file, default_flow_style=False)

        # create a database entry for the protein
        if ortholog:
            # for orthologs, use properties from the human protein
            self.create_protein(p.name, p.family, p.sequence_type,
                                p.residue_numbering_scheme, accession, up)
            continue

        # otherwise, create a new family, and use Uniprot name
        top_level_parent_family = ProteinFamily.objects.get(slug=p.family.slug.split('_')[0])
        num_families = ProteinFamily.objects.filter(parent=top_level_parent_family).count()
        family_slug = top_level_parent_family.slug + "_" + str(num_families + 1).zfill(3)
        other_family, created = ProteinFamily.objects.get_or_create(
            parent=top_level_parent_family, name='Other', defaults={'slug': family_slug})
        if created:
            self.logger.info('Created protein family {}'.format(other_family))

        # derive the child slug from the actual "Other" family, which may pre-exist
        # with a slug different from the one computed above
        family_slug = other_family.slug + '_001'
        unclassified_family, created = ProteinFamily.objects.get_or_create(
            parent=other_family, name='Unclassified', defaults={'slug': family_slug})
        if created:
            self.logger.info('Created protein family {}'.format(unclassified_family))

        num_families = ProteinFamily.objects.filter(parent=unclassified_family).count()
        family_slug = unclassified_family.slug + "_" + str(num_families + 1).zfill(3)
        pf, created = ProteinFamily.objects.get_or_create(
            parent=unclassified_family, name=up['genes'][0], defaults={'slug': family_slug})
        if created:
            self.logger.info('Created protein family {}'.format(pf))

        self.create_protein(up['genes'][0], pf, p.sequence_type,
                            p.residue_numbering_scheme, accession, up)

    self.logger.info('COMPLETED CREATING OTHER PROTEINS')
class GenericNumbering(object):
    """Annotate the residues of a PDB structure with generic residue numbers.

    The sequence of every chain is extracted from the structure, searched against a
    local BLAST database, and the database residues of each hit are mapped back onto
    the structure residues (segment, B-W number, GPCRdb number, display number).
    """

    # three-letter residue names that are mapped; anything else is ignored
    residue_list = [
        "ARG", "ASP", "GLU", "HIS", "ASN", "GLN", "LYS", "SER", "THR", "HID", "PHE",
        "LEU", "ILE", "TYR", "TRP", "VAL", "MET", "PRO", "CYS", "ALA", "GLY"
    ]

    def __init__(self, pdb_file=None, pdb_filename=None, structure=None, blast_path='blastp',
                 blastdb=os.sep.join([settings.STATICFILES_DIRS[0], 'blast', 'protwis_blastdb'])):
        """Load the structure and prepare the per-chain residue dictionaries.

        Args:
            pdb_file: a name/path or a handle to an open file; takes precedence.
            pdb_filename: path of the PDB file, used when ``pdb_file`` is not given.
            structure: an already parsed model, used when neither file argument is given.
            blast_path: BLAST executable handed to ``BlastSearch``.
            blastdb: BLAST database handed to ``BlastSearch``.
        """
        # pdb_file can be either a name/path or a handle to an open file
        self.pdb_file = pdb_file
        self.pdb_filename = pdb_filename

        # chain id -> {residue number -> MappedResidue} with alignment and bw number info
        self.residues = {}
        # chain id -> one-letter sequence of the mapped residues
        self.pdb_seq = {}
        # list of protein ids returned from blast
        self.prot_id_list = []

        # setup for local blast search
        self.blast = BlastSearch(blast_path=blast_path, blastdb=blastdb)

        source = self.pdb_file or self.pdb_filename
        if source:
            # only the first model of the file is used
            self.pdb_structure = PDBParser(PERMISSIVE=True, QUIET=True).get_structure('ref', source)[0]
        else:
            self.pdb_structure = structure

        self.parse_structure(self.pdb_structure)

    def parse_structure(self, pdb_struct):
        """Extract the sequence and prepare the dictionary of residues for every chain.

        bio.pdb reads pdb in the following cascade: model->chain->residue->atom
        """
        for chain in pdb_struct:
            chain_residues = {}
            self.residues[chain.id] = chain_residues

            for res in chain:
                if res.resname not in self.residue_list:
                    continue
                # HID is treated as histidine; it has no one-letter code of its own
                three_letter = 'HIS' if res.resname == "HID" else res.resname
                # in bio.pdb the residue's id is a tuple of
                # (hetatm flag, residue number, insertion code)
                chain_residues[res.id[1]] = MappedResidue(
                    res.id[1], polypeptide.three_to_one(three_letter))

            ordered_numbers = sorted(chain_residues)
            self.pdb_seq[chain.id] = ''.join(chain_residues[x].name for x in ordered_numbers)

            # 1-based position of every residue within the extracted sequence
            for pos, resnum in enumerate(ordered_numbers, start=1):
                chain_residues[resnum].pos_in_aln = pos

    def locate_res_by_pos(self, chain, pos):
        """Return the residue number at sequence position ``pos`` of ``chain``, or 0 if none."""
        for resnum, mapped in self.residues[chain].items():
            if mapped.pos_in_aln == pos:
                return resnum
        return 0

    def map_blast_seq(self, prot_id, hsps, chain):
        """Transfer database annotations of ``prot_id`` onto ``chain`` along one BLAST HSP.

        Args:
            prot_id: protein used to select the database residues.
            hsps: a BLAST high-scoring pair providing ``query``, ``sbjct``,
                ``query_start`` and ``sbjct_start``.
            chain: id of the structure chain that was used as the query.
        """
        # find database residue numbers corresponding to those in pdb file
        subj_counter = hsps.sbjct_start
        q_counter = hsps.query_start

        logger.info("{}\n{}".format(hsps.query, hsps.sbjct))
        logger.info("{:d}\t{:d}".format(hsps.query_start, hsps.sbjct_start))

        rs = Residue.objects.prefetch_related(
            'display_generic_number', 'protein_segment').filter(protein_conformation__protein=prot_id)
        residues = {r.sequence_number: r for r in rs}

        skipped = ('-', 'X', ' ')
        for q_char, s_char in zip(hsps.query, hsps.sbjct):
            # skipping position if there is a gap in either of sequences
            if q_char in skipped:
                subj_counter += 1
                continue
            if s_char in skipped:
                q_counter += 1
                continue

            # annotations are only transferred between identical residues
            if s_char == q_char:
                resn = self.locate_res_by_pos(chain, q_counter)
                if resn != 0:
                    if subj_counter in residues:
                        db_res = residues[subj_counter]
                        mapped = self.residues[chain][resn]

                        if db_res.protein_segment:
                            mapped.add_segment(db_res.protein_segment.slug)

                        if db_res.display_generic_number:
                            num = db_res.display_generic_number.label
                            # the label has the form "<bw>x<gpcrdb suffix>"
                            bw, gpcrdb = num.split('x')
                            gpcrdb = "{}.{}".format(bw.split('.')[0], gpcrdb)
                            mapped.add_bw_number(bw)
                            mapped.add_gpcrdb_number(gpcrdb)
                            mapped.add_gpcrdb_number_id(db_res.display_generic_number.id)
                            mapped.add_display_number(num)
                            mapped.add_residue_record(db_res)
                    else:
                        logger.warning(
                            "Could not find residue {} {} in the database.".format(resn, subj_counter))

                    if prot_id not in self.prot_id_list:
                        self.prot_id_list.append(prot_id)

            q_counter += 1
            subj_counter += 1

    def get_substructure_mapping_dict(self):
        """Return a dict mapping every segment to the list of its residue numbers."""
        mapping_dict = {}
        for chain_residues in self.residues.values():
            for mapped in chain_residues.values():
                mapping_dict.setdefault(mapped.segment, []).append(mapped.number)
        return mapping_dict

    def get_annotated_structure(self):
        """Write the generic numbers into the B-factor fields and return the structure.

        The GPCRdb number goes to the CA atom and the B-W number to the N atom of
        every mapped residue; numbers equal to 0 are left out.
        """
        for chain in self.pdb_structure:
            chain_residues = self.residues[chain.id]
            for residue in chain:
                mapped = chain_residues.get(residue.id[1])
                if mapped is None:
                    continue
                if mapped.gpcrdb != 0.:
                    residue["CA"].set_bfactor(float(mapped.gpcrdb))
                if mapped.bw != 0.:
                    residue["N"].set_bfactor(float(mapped.bw))

        return self.pdb_structure

    def save_gn_to_pdb(self):
        """Annotate the structure and save it next to ``self.pdb_filename``.

        The output name is the input name with ``_GPCRDB`` inserted before the
        extension.
        """
        # replace bfactor fields with the generic numbers
        self.get_annotated_structure()

        # get the basename, extension and export the pdb structure with b-w numbers
        root, ext = os.path.splitext(self.pdb_filename)
        io = PDBIO()
        io.set_structure(self.pdb_structure)
        io.save("%s_GPCRDB%s" % (root, ext))

    def assign_generic_numbers(self):
        """Run BLAST for every chain, map all hits and return the annotated structure."""
        # blast search goes first, looping through all the chains
        alignments = {chain: self.blast.run(seq) for chain, seq in self.pdb_seq.items()}

        # map the results onto pdb sequence for every sequence pair from blast
        for chain in self.pdb_seq:
            for alignment in alignments[chain]:
                if alignment == []:
                    continue
                for hsps in alignment[1].hsps:
                    self.map_blast_seq(alignment[0], hsps, chain)

        return self.get_annotated_structure()
def main_func(self, positions, iteration,count,lock):
    """Worker that creates database entries for non-human proteins from local UniProt files.

    Several workers may run this function at the same time; they share ``count`` and
    ``lock`` and each file in ``self.local_uniprot_dir`` is claimed by exactly one
    of them. Only files whose parsed ``source`` matches the current pass are handled.
    Any exception is printed, logged and handed to ``PrintException``.

    Args:
        positions: not used.
        iteration: 1 processes SWISSPROT entries, 2 processes TREMBL entries.
        count: shared counter exposing ``.value``, the index of the next file to claim.
        lock: context manager guarding ``count``.
    """
    self.logger.info('CREATING OTHER PROTEINS')
    try:
        # go through constructs and find their entry_names for lookup
        construct_entry_names = []
        self.logger.info('Getting construct accession codes')
        for source_file in os.listdir(self.construct_data_dir):
            source_file_path = os.sep.join([self.construct_data_dir, source_file])
            self.logger.info('Getting protein name from construct file {}'.format(source_file))

            # the extension is the second dot-separated part; names without a dot are skipped
            split_filename = source_file.split(".")
            if len(split_filename) < 2 or split_filename[1] != 'yaml':
                continue

            # read the yaml file
            # NOTE(review): yaml.load without an explicit Loader can construct arbitrary
            # objects and is rejected by recent PyYAML releases - consider yaml.safe_load
            with open(source_file_path, 'r') as f:
                sd = yaml.load(f)

            # check whether protein is specified (an empty document loads as None)
            if not isinstance(sd, dict) or 'protein' not in sd:
                continue

            # append entry_name to lookup list
            construct_entry_names.append(sd['protein'])

        # parse files
        filenames = os.listdir(self.local_uniprot_dir)

        # Keep track of first or second iteration
        reviewed = ['SWISSPROT','TREMBL'][iteration-1]
        skipped_due_to_swissprot = 0

        while True:
            # check and claim under the same lock, so that no other worker can move
            # the counter past the end between the bounds check and the indexing
            with lock:
                if count.value >= len(filenames):
                    break
                source_file = filenames[count.value]
                count.value += 1

            split_filename = source_file.split(".")
            if len(split_filename) < 2 or split_filename[1] != 'txt':
                continue
            accession = split_filename[0]

            up = self.parse_uniprot_file(accession)

            # Skip TREMBL on first loop, and SWISSPROT on second
            if reviewed != up['source']:
                continue

            # skip human proteins
            if 'species_latin_name' in up and up['species_latin_name'] == 'Homo sapiens':
                continue

            # should proteins that are not constructs be skipped?
            if self.constructs_only and up['entry_name'] not in construct_entry_names:
                continue

            # is there already an entry for this protein?
            try:
                Protein.objects.get(entry_name=up['entry_name'])
            except Protein.DoesNotExist:
                pass
            else:
                continue

            # is this an ortholog of a human protein?
            ortholog = False
            p = None

            # get human ortholog using gene name
            for gene in up['genes']:
                try:
                    g = Gene.objects.get(name__iexact=gene, species__common_name="Human", position=0)
                except Gene.DoesNotExist:
                    self.logger.info("No gene found for {}".format(gene))
                    continue
                p = g.proteins.all().order_by('id')[0]
                ortholog = True
                self.logger.info("Human ortholog found: {}".format(p.entry_name))
                break

            # if gene name not found, try using entry name
            if not p:
                # add _ to the split entry name to avoid e.g. gp1 matching gp139
                entry_name_query = up['entry_name'].split('_')[0] + '_'
                try:
                    p = Protein.objects.get(entry_name__startswith=entry_name_query, species__common_name="Human")
                    ortholog = True
                    self.logger.info("Human ortholog found: {}".format(p.entry_name))
                except Protein.DoesNotExist:
                    self.logger.info("No match found for {}".format(entry_name_query))

            # check whether the entry name is in the construct list
            if not p and up['entry_name'] in construct_entry_names:
                # BLAST sequence to find closest hit (for reference positions)
                blast_out = BlastSearch().run(up['sequence'])

                # use first hit from BLAST as template for reference positions;
                # without any hit there is no template and the protein is skipped below
                template_found = False
                if blast_out:
                    try:
                        p = Protein.objects.get(pk=blast_out[0][0])
                        template_found = True
                    except Protein.DoesNotExist:
                        pass
                if not template_found:
                    print('Template protein for {} not found'.format(up['entry_name']))
                    self.logger.error('Template protein for {} not found'.format(up['entry_name']))

            # skip if no ortholog is found FIXME use a profile to find a good template
            if not p:
                continue

            # check whether an entry already exists for this protein/species
            # Skips unreviewed genes that have a matching SWISSPROT - some human orthologues
            # can have several orthologues from same species.
            # Eg: agtra_rat and agtrb_rat for AGTR1_HUMAN
            already_entry_names = list(
                Protein.objects.filter(family=p.family,
                                       species__common_name=up['species_common_name'],
                                       source__name="SWISSPROT")
                .exclude(entry_name=up['entry_name'])
                .values_list('entry_name', flat = True))
            if already_entry_names:
                if "SWISSPROT" != up['source']:
                    skipped_due_to_swissprot += 1
                    continue
                self.logger.error("{} {} swissprot orthologue already there? {}".format(up['entry_name'], accession,already_entry_names))

            # NOTE: no reference position files are generated in this function.

            # create a database entry for the protein
            if ortholog:
                # for orthologs, use properties from the human protein
                self.create_protein(p.name, p.family, p.sequence_type,
                                    p.residue_numbering_scheme, accession, up)
                continue

            # otherwise, create a new family, and use Uniprot name
            top_level_parent_family = ProteinFamily.objects.get(slug=p.family.slug.split('_')[0])
            num_families = ProteinFamily.objects.filter(parent=top_level_parent_family).count()
            family_slug = top_level_parent_family.slug + "_" + str(num_families + 1).zfill(3)
            other_family, created = ProteinFamily.objects.get_or_create(
                parent=top_level_parent_family, name='Other', defaults={'slug': family_slug})
            if created:
                self.logger.info('Created protein family {}'.format(other_family))

            # derive the child slug from the actual "Other" family, which may pre-exist
            # with a slug different from the one computed above
            family_slug = other_family.slug + '_001'
            unclassified_family, created = ProteinFamily.objects.get_or_create(
                parent=other_family, name='Unclassified', defaults={'slug': family_slug})
            if created:
                self.logger.info('Created protein family {}'.format(unclassified_family))

            num_families = ProteinFamily.objects.filter(parent=unclassified_family).count()
            family_slug = unclassified_family.slug + "_" + str(num_families + 1).zfill(3)
            pf, created = ProteinFamily.objects.get_or_create(
                parent=unclassified_family, name=up['genes'][0], defaults={'slug': family_slug})
            if created:
                self.logger.info('Created protein family {}'.format(pf))

            self.create_protein(up['genes'][0], pf, p.sequence_type,
                                p.residue_numbering_scheme, accession, up)

        self.logger.info('COMPLETED CREATING OTHER PROTEINS')
    except Exception as msg:
        print(msg)
        self.logger.error(msg)
        PrintException()
def create_orthologs(self, constructs_only):
    """Create database entries for the non-human proteins in the local UniProt directory.

    For every ``<accession>.txt`` file in ``self.local_uniprot_dir`` the parsed
    UniProt record is matched to a human protein (by gene name, then by entry
    name prefix, then - for construct proteins only - by the best BLAST hit).
    Reference positions are generated when no reference file exists yet, and
    the protein is stored through ``self.create_protein``.

    Args:
        constructs_only: when true, only proteins whose entry name is listed
            under ``protein`` in a construct yaml file are processed.
    """
    self.logger.info('CREATING OTHER PROTEINS')

    # go through constructs and find their entry_names for lookup
    construct_entry_names = set()
    self.logger.info('Getting construct accession codes')
    for source_file in os.listdir(self.construct_data_dir):
        source_file_path = os.sep.join([self.construct_data_dir, source_file])
        self.logger.info('Getting protein name from construct file {}'.format(source_file))

        # entries without a dot (e.g. sub-directories) have no extension to inspect
        split_filename = source_file.split(".")
        if len(split_filename) < 2 or split_filename[1] != 'yaml':
            continue

        # read the yaml file; safe_load never instantiates arbitrary Python
        # objects and, unlike a bare yaml.load(f), works on current PyYAML
        with open(source_file_path, 'r') as f:
            sd = yaml.safe_load(f)

        # check whether protein is specified (an empty file parses to None)
        if not sd or 'protein' not in sd:
            continue

        # add entry_name to the lookup set
        construct_entry_names.add(sd['protein'])

    # parse files
    for source_file in os.listdir(self.local_uniprot_dir):
        split_filename = source_file.split(".")
        if len(split_filename) < 2 or split_filename[1] != 'txt':
            continue
        accession = split_filename[0]

        up = self.parse_uniprot_file(accession)

        # skip human proteins
        if 'species_latin_name' in up and up['species_latin_name'] == 'Homo sapiens':
            continue

        # should proteins that are not constructs be skipped?
        if constructs_only and up['entry_name'] not in construct_entry_names:
            continue

        # is there already an entry for this protein?
        if Protein.objects.filter(entry_name=up['entry_name']).exists():
            continue

        # is this an ortholog of a human protein?
        ortholog = False
        p = None

        # get human ortholog using gene name
        for gene in up['genes']:
            try:
                g = Gene.objects.get(name__iexact=gene, species__id=1, position=0)
            except Gene.DoesNotExist:
                self.logger.info("No gene found for {}".format(gene))
                continue

            candidate = g.proteins.all().order_by('id').first()
            if candidate is None:
                # a gene without proteins cannot provide a template
                continue
            p = candidate
            ortholog = True
            self.logger.info("Human ortholog found: {}".format(p.entry_name))
            break

        # if gene name not found, try using entry name
        if not p:
            split_entry_name = up['entry_name'].split('_')

            # add _ to the split entry name to avoid e.g. gp1 matching gp139
            entry_name_query = split_entry_name[0] + '_'
            try:
                p = Protein.objects.get(entry_name__startswith=entry_name_query, species__id=1)
                ortholog = True
                self.logger.info("Human ortholog found: {}".format(p.entry_name))
            except Protein.DoesNotExist:
                self.logger.info("No match found for {}".format(entry_name_query))

        # check whether the entry name is in the construct list
        if not p and up['entry_name'] in construct_entry_names:
            # BLAST sequence to find closest hit (for reference positions)
            blast = BlastSearch()
            blast_out = blast.run(up['sequence'])

            # use first hit from BLAST as template for reference positions;
            # an empty result list is treated like a missing template
            try:
                p = Protein.objects.get(pk=blast_out[0][0])
            except (IndexError, Protein.DoesNotExist):
                self.logger.error('Template protein for {} not found'.format(up['entry_name']))

        # skip if no ortholog is found FIXME use a profile to find a good template
        if not p:
            continue

        # check whether reference positions exist for this protein, and find them if they do not
        ref_position_file_path = os.sep.join([self.ref_position_source_dir, up['entry_name'] + '.yaml'])
        auto_ref_position_file_path = os.sep.join([self.auto_ref_position_source_dir,
                                                   up['entry_name'] + '.yaml'])
        if not os.path.isfile(ref_position_file_path) and not os.path.isfile(auto_ref_position_file_path):
            # get reference positions of human ortholog
            template_ref_position_file_path = os.sep.join([self.ref_position_source_dir,
                                                           p.entry_name + '.yaml'])
            if not os.path.isfile(template_ref_position_file_path):
                # use a non human sequence
                template_ref_position_file_path = os.sep.join([self.auto_ref_position_source_dir,
                                                               p.entry_name + '.yaml'])

            ref_positions = align_protein_to_reference(up, template_ref_position_file_path, p)

            # write reference positions to a file
            with open(auto_ref_position_file_path, "w") as auto_ref_position_file:
                yaml.dump(ref_positions, auto_ref_position_file, default_flow_style=False)

        # create a database entry for the protein
        if ortholog:
            # for orthologs, use properties from the human protein
            self.create_protein(p.name, p.family, p.sequence_type, p.residue_numbering_scheme,
                                accession, up)
        else:
            # otherwise, create a new family, and use Uniprot name
            top_level_parent_family = ProteinFamily.objects.get(slug=p.family.slug.split('_')[0])
            num_families = ProteinFamily.objects.filter(parent=top_level_parent_family).count()
            family_slug = top_level_parent_family.slug + "_" + str(num_families + 1).zfill(3)
            other_family, created = ProteinFamily.objects.get_or_create(
                parent=top_level_parent_family, name='Other', defaults={'slug': family_slug})
            if created:
                self.logger.info('Created protein family {}'.format(other_family))

            family_slug += '_001'
            unclassified_family, created = ProteinFamily.objects.get_or_create(
                parent=other_family, name='Unclassified', defaults={'slug': family_slug})
            if created:
                self.logger.info('Created protein family {}'.format(unclassified_family))

            num_families = ProteinFamily.objects.filter(parent=unclassified_family).count()
            family_slug = unclassified_family.slug + "_" + str(num_families + 1).zfill(3)
            pf, created = ProteinFamily.objects.get_or_create(
                parent=unclassified_family, name=up['genes'][0], defaults={'slug': family_slug})
            if created:
                self.logger.info('Created protein family {}'.format(pf))

            self.create_protein(up['genes'][0], pf, p.sequence_type, p.residue_numbering_scheme,
                                accession, up)

    self.logger.info('COMPLETED CREATING OTHER PROTEINS')
class GenericNumbering(object):
    """Annotate the residues of a PDB structure with generic residue numbers.

    Two routes are implemented: a BLAST search of every chain sequence whose
    hits are mapped back onto the PDB residues (``assign_generic_numbers``),
    and re-use of a ``SequenceParser`` mapping when the object is created with
    ``sequence_parser=True``. The numbers are written into atom B-factors.
    """

    residue_list = ["ARG","ASP","GLU","HIS","ASN","GLN","LYS","SER","THR","HID","PHE","LEU","ILE","TYR","TRP","VAL","MET","PRO","CYS","ALA","GLY"]
    # pdb code -> [residue number threshold, maximum number of positions]; read by
    # assign_cgn_with_sequence_parser to mark positions as 'x' and shift later lookups
    exceptions = {'6GDG':[255, 10]}

    def __init__ (self, pdb_file=None, pdb_filename=None, structure=None, pdb_code=None, blast_path='blastp',
        blastdb=os.sep.join([settings.STATICFILES_DIRS[0], 'blast', 'protwis_blastdb']),top_results=1, sequence_parser=False, signprot=False):
        """Load the structure and prepare the per-chain residue dictionaries.

        Args:
            pdb_file: a name/path or a handle to an open file.
            pdb_filename: path used when ``pdb_file`` is not given; also used
                by ``save_gn_to_pdb`` to derive the output file name.
            structure: an already parsed model, used when no file is given.
            pdb_code: 4 letter pdb code, used to look up the wild type protein.
            blast_path, blastdb, top_results: passed on to ``BlastSearch``.
            sequence_parser: when true, the structure and mapping are taken
                from a ``SequenceParser`` instead of being parsed here.
            signprot: optional object whose ``id`` is passed to the
                ``SequenceParser`` as the wild type protein id.
        """
        # pdb_file can be either a name/path or a handle to an open file
        self.pdb_file = pdb_file
        self.pdb_filename = pdb_filename

        # if pdb 4 letter code is specified
        self.pdb_code = pdb_code

        # dictionary of 'MappedResidue' object storing information about alignments and bw numbers
        self.residues = {}
        self.pdb_seq = {}
        # list of uniprot ids returned from blast
        self.prot_id_list = []
        # setup for local blast search
        self.blast = BlastSearch(blast_path=blast_path, blastdb=blastdb, top_results=top_results)

        # calling sequence parser
        if sequence_parser:
            if pdb_code:
                struct = Structure.objects.get(pdb_code__index=self.pdb_code)
            if not signprot:
                if pdb_code:
                    s = SequenceParser(pdb_file=self.pdb_file,
                                       wt_protein_id=struct.protein_conformation.protein.parent.id)
                else:
                    s = SequenceParser(pdb_file=self.pdb_file)
            else:
                s = SequenceParser(pdb_file=self.pdb_file, wt_protein_id=signprot.id)
            self.pdb_structure = s.pdb_struct
            self.mapping = s.mapping
            self.wt = s.wt
        else:
            if self.pdb_file:
                self.pdb_structure = PDBParser(PERMISSIVE=True, QUIET=True).get_structure('ref', self.pdb_file)[0]
            elif self.pdb_filename:
                self.pdb_structure = PDBParser(PERMISSIVE=True, QUIET=True).get_structure('ref', self.pdb_filename)[0]
            else:
                self.pdb_structure = structure

            self.parse_structure(self.pdb_structure)

    def parse_structure(self, pdb_struct):
        """
        extracting sequence and preparing dictionary of residues
        bio.pdb reads pdb in the following cascade: model->chain->residue->atom
        """
        for chain in pdb_struct:
            chain_residues = {}
            self.residues[chain.id] = chain_residues

            for res in chain:
                # in bio.pdb the residue's id is a tuple of (hetatm flag, residue number, insertion code)
                if res.resname not in self.residue_list:
                    continue
                # HID is stored as a plain histidine; previously the converted
                # name was computed but the residue itself was never recorded
                resname = 'HIS' if res.resname == "HID" else res.resname
                chain_residues[res.id[1]] = MappedResidue(res.id[1], polypeptide.three_to_one(resname))

            ordered_numbers = sorted(chain_residues.keys())
            self.pdb_seq[chain.id] = ''.join([chain_residues[x].name for x in ordered_numbers])

            # 1-based position of every residue within the chain sequence
            for pos, res in enumerate(ordered_numbers, start=1):
                chain_residues[res].pos_in_aln = pos

    def locate_res_by_pos (self, chain, pos):
        """Return the residue number at sequence position ``pos`` of ``chain``, or 0 if there is none."""
        for res in self.residues[chain].keys():
            if self.residues[chain][res].pos_in_aln == pos:
                return res
        return 0

    def map_blast_seq (self, prot_id, hsps, chain):
        """Transfer segment and generic number data from a BLAST hit onto the PDB residues.

        Args:
            prot_id: id of the hit protein whose residues are looked up.
            hsps: a BLAST HSP providing ``query``, ``sbjct``, ``query_start``
                and ``sbjct_start``.
            chain: id of the chain the query sequence was taken from.
        """
        # find uniprot residue numbers corresponding to those in pdb file
        subj_counter = hsps.sbjct_start
        q_counter = hsps.query_start

        logger.info("{}\n{}".format(hsps.query, hsps.sbjct))
        logger.info("{:d}\t{:d}".format(hsps.query_start, hsps.sbjct_start))

        rs = Residue.objects.prefetch_related('display_generic_number', 'protein_segment').filter(
            protein_conformation__protein=prot_id)
        residues = {r.sequence_number: r for r in rs}

        skipped = ('-', 'X', ' ')
        # walk the two aligned strings column by column
        for q_char, s_char in zip(hsps.query, hsps.sbjct):
            # skipping position if there is a gap in either of sequences
            if q_char in skipped:
                subj_counter += 1
                continue
            if s_char in skipped:
                q_counter += 1
                continue

            if s_char == q_char:
                resn = self.locate_res_by_pos(chain, q_counter)
                if resn != 0:
                    if subj_counter in residues:
                        db_res = residues[subj_counter]
                        mapped = self.residues[chain][resn]

                        if db_res.protein_segment:
                            mapped.add_segment(db_res.protein_segment.slug)

                        if db_res.display_generic_number:
                            num = db_res.display_generic_number.label
                            bw, gpcrdb = num.split('x')
                            # Handle non-numerical GNs - still add segment number.
                            # Strings are immutable, so the leading character is
                            # replaced by rebuilding the string.
                            if not bw[0].isnumeric():
                                bw = "0" + bw[1:]
                            gpcrdb = "{}.{}".format(bw.split('.')[0], gpcrdb)
                            mapped.add_bw_number(bw)
                            mapped.add_gpcrdb_number(gpcrdb)
                            mapped.add_gpcrdb_number_id(db_res.display_generic_number.id)
                            mapped.add_display_number(num)
                            mapped.add_residue_record(db_res)
                    else:
                        logger.warning("Could not find residue {} {} in the database.".format(resn, subj_counter))

                    if prot_id not in self.prot_id_list:
                        self.prot_id_list.append(prot_id)

            q_counter += 1
            subj_counter += 1

    def get_substructure_mapping_dict(self):
        """Return a dictionary mapping each segment to the list of residue numbers assigned to it."""
        mapping_dict = {}
        for chain in self.residues.keys():
            for res in self.residues[chain].values():
                mapping_dict.setdefault(res.segment, []).append(res.number)
        return mapping_dict

    def get_annotated_structure(self):
        """Write the numbers into B-factors (gpcrdb on CA, bw on N) and return the structure.

        Residues whose numbers cannot be converted to float are skipped.
        """
        for chain in self.pdb_structure:
            chain_residues = self.residues[chain.id]
            for residue in chain:
                if residue.id[1] not in chain_residues:
                    continue
                mapped = chain_residues[residue.id[1]]
                try:
                    if mapped.gpcrdb != 0.:
                        residue["CA"].set_bfactor(float(mapped.gpcrdb))
                    if mapped.bw != 0.:
                        residue["N"].set_bfactor(float(mapped.bw))
                except ValueError:
                    continue

        return self.pdb_structure

    def save_gn_to_pdb(self):
        """Write the numbers into B-factors and save the structure as ``<root>_GPCRDB<ext>``.

        The output name is derived from ``self.pdb_filename``.
        """
        # replace bfactor field of CA atoms with b-w numbers
        for chain in self.pdb_structure:
            chain_residues = self.residues[chain.id]
            for residue in chain:
                if residue.id[1] not in chain_residues:
                    continue
                mapped = chain_residues[residue.id[1]]
                if mapped.gpcrdb != 0.:
                    residue["CA"].set_bfactor(float(mapped.gpcrdb))
                if mapped.bw != 0.:
                    residue["N"].set_bfactor(float(mapped.bw))

        # get the basename, extension and export the pdb structure with b-w numbers
        root, ext = os.path.splitext(self.pdb_filename)
        io = PDBIO()
        io.set_structure(self.pdb_structure)
        io.save("%s_GPCRDB%s" % (root, ext))

    def assign_generic_numbers(self):
        """Run BLAST for every chain, map all hits and return the annotated structure."""
        # blast search goes first, looping through all the chains
        alignments = {chain: self.blast.run(seq) for chain, seq in self.pdb_seq.items()}

        # map the results onto pdb sequence for every sequence pair from blast
        for chain in self.pdb_seq.keys():
            for alignment in alignments[chain]:
                if alignment == []:
                    continue
                for hsps in alignment[1].hsps:
                    self.map_blast_seq(alignment[0], hsps, chain)

        return self.get_annotated_structure()

    def assign_generic_numbers_with_sequence_parser(self):
        """Write generic numbers from the SequenceParser mapping into B-factors and return the structure."""
        for chain in self.pdb_structure:
            if chain.id not in self.mapping:
                continue
            chain_mapping = self.mapping[chain.id]
            for residue in chain:
                if residue.id[1] not in chain_mapping:
                    continue
                gpcrdb_num = chain_mapping[residue.id[1]].gpcrdb
                # residues without a generic number carry an empty value
                if not gpcrdb_num or len(gpcrdb_num.split('x')) != 2:
                    continue
                bw, gn = gpcrdb_num.split('x')
                gn = "{}.{}".format(bw.split('.')[0], gn)
                # three digit positions are stored as a negative two digit number
                if len(gn.split('.')[1]) == 3:
                    gn = '-' + gn[:-1]
                try:
                    residue["CA"].set_bfactor(float(gn))
                    residue["N"].set_bfactor(float(bw))
                except (KeyError, ValueError):
                    # best effort: missing atom or a non-numerical number
                    pass
        return self.pdb_structure

    def assign_cgn_with_sequence_parser(self, target_chain):
        """Return an OrderedDict segment -> {generic number: atom list or 'x'} for ``target_chain``.

        Positions whose residue cannot be found in the structure are stored as 'x'.
        """
        pdb_array = OrderedDict()
        for s in G_PROTEIN_SEGMENTS['Full']:
            pdb_array[s] = OrderedDict()

        # number of positions marked through ``exceptions``; later residue lookups are shifted by it
        i = 0
        for key, vals in self.mapping[target_chain].items():
            _, segment, _ = vals.gpcrdb.split('.')
            if self.pdb_code in self.exceptions:
                first_number, max_positions = self.exceptions[self.pdb_code]
                try:
                    if self.pdb_structure[target_chain][key].get_id()[1] >= first_number:
                        if i < max_positions:
                            pdb_array[segment][vals.gpcrdb] = 'x'
                            i += 1
                            continue
                except KeyError:
                    pass
            try:
                pdb_array[segment][vals.gpcrdb] = self.pdb_structure[target_chain][key - i].get_list()
            except KeyError:
                pdb_array[segment][vals.gpcrdb] = 'x'
        return pdb_array
class SequenceParser(object):
    """
    Class mapping the pdb, pdb_seqres, wildtype and any given sequence onto wt using blast with human sequences database. It produces a report with missing, mutated and inserted residues.
    """

    residue_list = ["ARG","ASP","GLU","HIS","ASN","GLN","LYS","SER","THR", "HIS", "HID","PHE","LEU","ILE","TYR","TRP","VAL","MET","PRO","CYS","ALA","GLY"]

    def __init__(self, pdb_file=None, sequence=None, wt_protein_id=None):
        """Parse the structure (if any) and map its chains onto the wild type.

        Args:
            pdb_file: pdb file passed to ``PDBParser`` and ``SeqIO``; optional.
            sequence: a string, or a dictionary of residue number: one letter
                code pairs. A string is converted to such a dictionary.
            wt_protein_id: id of the wild type ``Protein``. If not specified,
                the wild type is looked up through the pdb code.
        """
        # dictionary of 'ParsedResidue' object storing information about alignments and bw numbers
        self.mapping = {}
        self.residues = {}
        self.segments = {}
        self.blast = BlastSearch(blastdb=os.sep.join([settings.STATICFILES_DIRS[0], 'blast', 'protwis_blastdb']))
        self.wt_protein_id = wt_protein_id

        # always defined, so that an object created without a pdb file is usable
        self.pdb_struct = None
        self.seqres = None
        self.struct_id = None
        if pdb_file is not None:
            self.pdb_struct = PDBParser(QUIET=True).get_structure('pdb', pdb_file)[0]
            # a list of SeqRecord objects retrived from the pdb SEQRES section
            try:
                self.seqres = list(SeqIO.parse(pdb_file, 'pdb-seqres'))
                # SeqRecord id is a pdb_code:chain
                self.struct_id = self.seqres[0].id.split(':')[0]
            except Exception:
                # best effort: files without a usable SEQRES section are accepted
                self.seqres = None
                self.struct_id = None

        self.sequence = sequence
        if isinstance(sequence, str):
            self.sequence = dict(enumerate(sequence))

        # If not specified, attempt to get wildtype from pdb.
        self.wt = None
        if wt_protein_id:
            self.wt = Protein.objects.get(id=wt_protein_id)
        elif pdb_file is not None:
            try:
                self.wt = Structure.objects.get(pdb_code__index=self.struct_id).protein_conformation.protein.parent
            except Exception:
                # best effort: unknown structures simply have no wild type
                self.wt = None
        self.wt_seq = str(self.wt.sequence) if self.wt is not None else ''

        self.fusions = []

        if self.pdb_struct is not None:
            self.parse_pdb(self.pdb_struct)
        #if self.seqres:
        #    self.map_seqres()

        self.mark_deletions()

    def parse_pdb(self, pdb_struct):
        """
        extracting sequence and preparing dictionary of residues
        bio.pdb reads pdb in the following cascade: model->chain->residue->atom
        """
        for chain in pdb_struct:
            self.residues[chain.id] = []

            for res in chain:
                # in bio.pdb the residue's id is a tuple of (hetatm flag, residue number, insertion code)
                if res.resname.replace('HID', 'HIS') not in self.residue_list:
                    continue
                self.residues[chain.id].append(res)

            for peptide in self.get_chain_peptides(chain.id):
                self.map_to_wt_blast(chain.id, peptide, None, int(peptide[0].id[1]))

    def get_segments(self):
        """Return (and store) segment slug -> [min, max] pdb residue number for the first chain."""
        # get the first chain
        c = list(self.mapping.keys())[0]

        for segment in ProteinSegment.objects.all():
            resi = []
            for r in Residue.objects.filter(protein_conformation__protein=self.wt.id, protein_segment=segment):
                if self.mapping[c][r.sequence_number].resnum is not None:
                    resi.append(self.mapping[c][r.sequence_number].resnum)
            if resi == []:
                continue
            self.segments[segment.slug] = [min(resi), max(resi)]
        return self.segments

    def get_chain_peptides(self, chain_id, gap_threshold=230):
        """
        Get peptides of sequential residue numbers (with respect to 230 aa gaps).
        The maximum length of ICL3 is 230 aa, and fusion proteins usualy have significantly different numbers, i.e. exceeding the 230 gap between TM5 and 6.

        The maximum allowed gap size can be evaluated automaticaly, but it is fairly costly:
        max([len(Residue.objects.filter(protein_segment=11, protein_conformation__protein=x)) for x in Protein.objects.filter(species=1)])
        """
        chain_residues = self.residues[chain_id]
        rnumbers = [int(x.id[1]) for x in chain_residues]
        last_idx = len(rnumbers) - 1
        peptides = []
        current = []
        for i, (rnum, res) in enumerate(zip(rnumbers, chain_residues)):
            current.append(res)
            if i == last_idx:
                # FIXME: Assuming that very last residue is actualy continuation of a chain
                peptides.append(current)
                break
            next_num = rnumbers[i + 1]
            if next_num != rnum + 1 and abs(rnum + 1 - next_num) > gap_threshold:
                peptides.append(current)
                current = []
        return peptides

    def get_chain_sequence(self, chain):
        """
        Returns a sequence string of a given chain.
        """
        return self.get_peptide_sequence(self.residues[chain])

    def get_peptide_sequence(self, residues):
        """
        Returns a sequence string of a given list of Bio.PDB.Residue objects.
        """
        return "".join([polypeptide.three_to_one(x.resname.replace('HID', 'HIS')) for x in residues if x.resname in self.residue_list])

    def find_nonredundant_chains(self):
        """
        Returns a list of nonidentical chains: the first chain of every group
        of chains with an equal mapping.
        """
        unique = []
        for chain_id, chain_mapping in self.mapping.items():
            if all(chain_mapping != self.mapping[kept] for kept in unique):
                unique.append(chain_id)
        return unique

    def _wt_mapping(self):
        """Return sequence number -> ParsedResidue for all residues of the current wild type."""
        wt_resi = list(Residue.objects.filter(protein_conformation__protein=self.wt.id))
        return {x.sequence_number: ParsedResidue(x.amino_acid, x.sequence_number, str(x.display_generic_number) if x.display_generic_number else None, x.protein_segment) for x in wt_resi}

    def map_to_wt_blast(self, chain_id, residues = None, sequence=None, starting_aa = 1, seqres = False):
        """BLAST a peptide, sequence or whole chain and map the hits of the wild type onto it.

        Args:
            chain_id: id of the chain the mapping is stored under.
            residues: list of Bio.PDB residues to take the sequence from.
            sequence: sequence used when ``residues`` is not given; when both
                are missing the whole chain is used.
            starting_aa: pdb number of the first residue of the sequence.
            seqres: passed on to ``map_hsps``.
        """
        if residues:
            seq = self.get_peptide_sequence(residues)
        elif sequence:
            seq = sequence
        else:
            seq = self.get_chain_sequence(chain_id)
        alignments = self.blast.run(seq)

        if self.wt_protein_id is not None:
            self.wt = Protein.objects.get(id=self.wt_protein_id)
        else:
            self.wt = None

        for alignment in alignments:
            if self.wt is None:
                try:
                    self.wt = Protein.objects.get(entry_name=str(alignment[1].hit_def))
                    self.mapping[chain_id] = self._wt_mapping()
                except Exception:
                    # best effort: a hit that cannot be resolved is ignored
                    pass
            else:
                self.mapping[chain_id] = self._wt_mapping()

            if alignment[1].hsps[0].expect > .5 and residues:
                # self.fusions.append(AuxProtein(residues))
                # The case when auxiliary protein is in a separate chain
                if self.get_chain_sequence(chain_id) == self.get_peptide_sequence(residues) and chain_id in self.mapping:
                    del self.mapping[chain_id]
                continue
            # no wild type could be resolved for this hit, nothing to map onto
            if self.wt is None or self.wt.id != int(alignment[0]):
                continue
            for hsps in alignment[1].hsps:
                self.map_hsps(hsps, chain_id, starting_aa, seqres)

    def map_hsps(self, hsps, chain_id, offset = 1, seqres = False):
        """
        Analyzes the High Similarity Protein Segment.
        """
        sbjct_counter = hsps.sbjct_start
        q_counter = hsps.query_start

        chain_mapping = self.mapping[chain_id]
        # NOTE(review): both counters advance on gap columns as well; confirm
        # that this is intended for insertions and deletions.
        for s, q in zip(hsps.sbjct, hsps.query):
            if s == q:
                if seqres:
                    chain_mapping[sbjct_counter].set_seqres(True)
                else:
                    chain_mapping[sbjct_counter].set_pdb_res_num(offset - 1 + q_counter)
                sbjct_counter += 1
                q_counter += 1
            elif s != '-' and q != '-':
                chain_mapping[sbjct_counter].set_pdb_res_num(offset - 1 + q_counter)
                chain_mapping[sbjct_counter].set_mutation(q)
                sbjct_counter += 1
                q_counter += 1
            elif s == '-' and q != '-':
                chain_mapping[offset - 1 + q_counter].set_insertion(q)
                sbjct_counter += 1
                q_counter += 1
            elif s != '-' and q == '-':
                chain_mapping[sbjct_counter].set_deletion()
                sbjct_counter += 1
                q_counter += 1

    def map_to_wt_pw(self, chain_id, residues = None, sequence=None, starting_aa = 1, seqres = False):
        """
        @param sequence: a dictionary of residue number: residue one letter code pairs
        @param seqres: when true, matching positions are flagged as present in SEQRES
        """
        if residues:
            seq = self.get_peptide_sequence(residues)
        elif sequence:
            seq = "".join(sequence.values())
        else:
            return

        wt, chain_seq, score, start, end = pairwise2.align.localms(self.wt_seq, seq, 2, -4, -4, -.1, one_alignment_only=True)[0]

        chain_mapping = self.mapping[chain_id]
        offset = 0
        for w, c in zip(wt, chain_seq):
            if w == c:
                if seqres:
                    chain_mapping[starting_aa + offset].seqres = True
                # NOTE(review): self.wt_seq_start is not set anywhere in this class - confirm its source
                r = Residue.objects.get(sequence_number=offset+self.wt_seq_start, protein_conformation__protein=self.wt.id)
                if r.display_generic_number is not None:
                    chain_mapping[starting_aa + offset].add_gpcrdb(r.display_generic_number)
                offset += 1
            elif c == '-' and w != '-':
                chain_mapping[starting_aa + offset].add_deletion()
            elif w != '-' and c != '-' and w != c:
                chain_mapping[starting_aa + offset].add_mutation(c)
                offset += 1
            elif w == '-' and c != '-':
                chain_mapping[starting_aa + offset].add_insertion(c)
                offset += 1

    def map_seqres(self):
        """Map every SEQRES record onto the wild type; does nothing when no SEQRES was read."""
        for sr in self.seqres or []:
            self.map_to_wt_blast(sr.annotations['chain'], sequence=sr.seq, seqres=True)

    def mark_deletions(self):
        """Flag every wild type residue without a pdb residue number as deleted."""
        for chain_mapping in self.mapping.values():
            for res in chain_mapping.values():
                if res.resnum is None:
                    res.set_deletion()

    def get_mapping_dict(self, pdb_keys=False, seqres=False):
        """Return chain -> {wild type number: value} built from the mapping.

        With ``pdb_keys`` the value is the seqres flag (``seqres`` true) or the
        pdb number; otherwise it is the pdb number, replaced by '-' for
        residues missing from SEQRES when ``seqres`` is true.
        """
        if pdb_keys:
            return {x: {y: res.seqres if seqres else res.resnum for y, res in chain_mapping.items()} for x, chain_mapping in self.mapping.items()}
        if seqres:
            return {x: {y: res.resnum if res.seqres else '-' for y, res in chain_mapping.items()} for x, chain_mapping in self.mapping.items()}
        return {x: {y: res.resnum for y, res in chain_mapping.items()} for x, chain_mapping in self.mapping.items()}

    def get_fusions(self):
        """Return {"auxiliary": {"aux1": info, "aux2": info, ...}}, or {} when there are no fusions."""
        if self.fusions == []:
            return {}
        fusion_dict = OrderedDict({"auxiliary": {}})
        # every fusion gets its own key
        for count, fusion in enumerate(self.fusions, start=1):
            fusion_dict["auxiliary"]["aux{}".format(count)] = fusion.get_info()
        return fusion_dict

    @staticmethod
    def _deletion_record(numbers, chain):
        """Describe one run of consecutive deleted positions."""
        return OrderedDict({
            "start" : min(numbers),
            "end" : max(numbers),
            "type" : "single" if len(numbers) == 1 else "range",
            "chain" : chain
            })

    def get_deletions(self):
        """Return {"deletions": [...]} with one record per run of consecutive deleted positions."""
        deletions_list = []
        for chain in self.find_nonredundant_chains():
            deleted = sorted(num for num, res in self.mapping[chain].items() if res.deletion)
            run = []
            for num in deleted:
                # a position that does not directly follow the previous one starts a new run
                if run and num - run[-1] != 1:
                    deletions_list.append(self._deletion_record(run, chain))
                    run = []
                run.append(num)
            if run:
                deletions_list.append(self._deletion_record(run, chain))

        return {"deletions" : deletions_list}

    def get_mutations(self):
        """Return {"mutations": [...]} with one record per mutated position of the nonidentical chains."""
        mutations_list = []
        for chain in self.find_nonredundant_chains():
            for num, res in self.mapping[chain].items():
                if res.mutation:
                    mutations_list.append(OrderedDict({
                        "wt" : res.name,
                        "mut" : res.mutation,
                        "pos (wt)" : num,
                        "pos (pdb)" : res.resnum,
                        "chain" : chain
                        }))
        return {"mutations" : mutations_list }

    def get_report(self):
        """Print every mapped residue, chain by chain."""
        for chain in sorted(self.mapping.keys()):
            print("Chain {}".format(chain))
            for res in sorted(self.mapping[chain].keys()):
                print(self.mapping[chain][res])

    def save_excel_report(self, file_name):
        """Write the mapping to an xlsx workbook with one worksheet per chain."""
        workbook = xlsxwriter.Workbook(file_name)

        for chain in sorted(self.mapping.keys()):
            worksheet = workbook.add_worksheet(chain)
            worksheet.write_row(0,0,["Protein number", "Residue name", "PDB number", "Generic number", "Mutation", "SEQRES"])

            for row_id, res in enumerate(sorted(self.mapping[chain].keys()), start=1):
                worksheet.write_row(row_id, 0, self.mapping[chain][res].get_param_list())
        workbook.close()
class GenericNumbering(object):
    """Annotate the residues of a PDB structure with generic residue numbers.

    Every chain sequence is searched with BLAST, the residues of the hits are
    mapped back onto the PDB residues and the numbers are written into atom
    B-factors.
    """

    residue_list = ["ARG","ASP","GLU","HIS","ASN","GLN","LYS","SER","THR","HID","PHE","LEU","ILE","TYR","TRP","VAL","MET","PRO","CYS","ALA","GLY"]

    def __init__ (self, pdb_file=None, pdb_filename=None, structure=None, blast_path='blastp',
        blastdb=os.sep.join([settings.STATICFILES_DIRS[0], 'blast', 'protwis_blastdb']),top_results=1):
        """Load the structure and prepare the per-chain residue dictionaries.

        Args:
            pdb_file: a name/path or a handle to an open file.
            pdb_filename: path used when ``pdb_file`` is not given; also used
                by ``save_gn_to_pdb`` to derive the output file name.
            structure: an already parsed model, used when no file is given.
            blast_path, blastdb, top_results: passed on to ``BlastSearch``.
        """
        # pdb_file can be either a name/path or a handle to an open file
        self.pdb_file = pdb_file
        self.pdb_filename = pdb_filename

        # dictionary of 'MappedResidue' object storing information about alignments and bw numbers
        self.residues = {}
        self.pdb_seq = {}
        # list of uniprot ids returned from blast
        self.prot_id_list = []
        # setup for local blast search
        self.blast = BlastSearch(blast_path=blast_path, blastdb=blastdb, top_results=top_results)

        if self.pdb_file:
            self.pdb_structure = PDBParser(PERMISSIVE=True, QUIET=True).get_structure('ref', self.pdb_file)[0]
        elif self.pdb_filename:
            self.pdb_structure = PDBParser(PERMISSIVE=True, QUIET=True).get_structure('ref', self.pdb_filename)[0]
        else:
            self.pdb_structure = structure

        self.parse_structure(self.pdb_structure)

    def parse_structure(self, pdb_struct):
        """
        extracting sequence and preparing dictionary of residues
        bio.pdb reads pdb in the following cascade: model->chain->residue->atom
        """
        for chain in pdb_struct:
            chain_residues = {}
            self.residues[chain.id] = chain_residues

            for res in chain:
                # in bio.pdb the residue's id is a tuple of (hetatm flag, residue number, insertion code)
                if res.resname not in self.residue_list:
                    continue
                # HID is stored as a plain histidine; previously the converted
                # name was computed but the residue itself was never recorded
                resname = 'HIS' if res.resname == "HID" else res.resname
                chain_residues[res.id[1]] = MappedResidue(res.id[1], polypeptide.three_to_one(resname))

            ordered_numbers = sorted(chain_residues.keys())
            self.pdb_seq[chain.id] = ''.join([chain_residues[x].name for x in ordered_numbers])

            # 1-based position of every residue within the chain sequence
            for pos, res in enumerate(ordered_numbers, start=1):
                chain_residues[res].pos_in_aln = pos

    def locate_res_by_pos (self, chain, pos):
        """Return the residue number at sequence position ``pos`` of ``chain``, or 0 if there is none."""
        for res in self.residues[chain].keys():
            if self.residues[chain][res].pos_in_aln == pos:
                return res
        return 0

    def map_blast_seq (self, prot_id, hsps, chain):
        """Transfer segment and generic number data from a BLAST hit onto the PDB residues.

        Args:
            prot_id: id of the hit protein whose residues are looked up.
            hsps: a BLAST HSP providing ``query``, ``sbjct``, ``query_start``
                and ``sbjct_start``.
            chain: id of the chain the query sequence was taken from.
        """
        # find uniprot residue numbers corresponding to those in pdb file
        subj_counter = hsps.sbjct_start
        q_counter = hsps.query_start

        logger.info("{}\n{}".format(hsps.query, hsps.sbjct))
        logger.info("{:d}\t{:d}".format(hsps.query_start, hsps.sbjct_start))

        rs = Residue.objects.prefetch_related('display_generic_number', 'protein_segment').filter(
            protein_conformation__protein=prot_id)
        residues = {r.sequence_number: r for r in rs}

        skipped = ('-', 'X', ' ')
        # walk the two aligned strings column by column
        for q_char, s_char in zip(hsps.query, hsps.sbjct):
            # skipping position if there is a gap in either of sequences
            if q_char in skipped:
                subj_counter += 1
                continue
            if s_char in skipped:
                q_counter += 1
                continue

            if s_char == q_char:
                resn = self.locate_res_by_pos(chain, q_counter)
                if resn != 0:
                    if subj_counter in residues:
                        db_res = residues[subj_counter]
                        mapped = self.residues[chain][resn]

                        if db_res.protein_segment:
                            mapped.add_segment(db_res.protein_segment.slug)

                        if db_res.display_generic_number:
                            num = db_res.display_generic_number.label
                            bw, gpcrdb = num.split('x')
                            gpcrdb = "{}.{}".format(bw.split('.')[0], gpcrdb)
                            mapped.add_bw_number(bw)
                            mapped.add_gpcrdb_number(gpcrdb)
                            mapped.add_gpcrdb_number_id(db_res.display_generic_number.id)
                            mapped.add_display_number(num)
                            mapped.add_residue_record(db_res)
                    else:
                        logger.warning("Could not find residue {} {} in the database.".format(resn, subj_counter))

                    if prot_id not in self.prot_id_list:
                        self.prot_id_list.append(prot_id)

            q_counter += 1
            subj_counter += 1

    def get_substructure_mapping_dict(self):
        """Return a dictionary mapping each segment to the list of residue numbers assigned to it."""
        mapping_dict = {}
        for chain in self.residues.keys():
            for res in self.residues[chain].values():
                mapping_dict.setdefault(res.segment, []).append(res.number)
        return mapping_dict

    def get_annotated_structure(self):
        """Write the numbers into B-factors (gpcrdb on CA, bw on N) and return the structure."""
        for chain in self.pdb_structure:
            chain_residues = self.residues[chain.id]
            for residue in chain:
                if residue.id[1] not in chain_residues:
                    continue
                mapped = chain_residues[residue.id[1]]
                if mapped.gpcrdb != 0.:
                    residue["CA"].set_bfactor(float(mapped.gpcrdb))
                if mapped.bw != 0.:
                    residue["N"].set_bfactor(float(mapped.bw))

        return self.pdb_structure

    def save_gn_to_pdb(self):
        """Write the numbers into B-factors and save the structure as ``<root>_GPCRDB<ext>``.

        The output name is derived from ``self.pdb_filename``.
        """
        # replace bfactor field of CA atoms with b-w numbers
        self.get_annotated_structure()

        # get the basename, extension and export the pdb structure with b-w numbers
        root, ext = os.path.splitext(self.pdb_filename)
        io = PDBIO()
        io.set_structure(self.pdb_structure)
        io.save("%s_GPCRDB%s" % (root, ext))

    def assign_generic_numbers(self):
        """Run BLAST for every chain, map all hits and return the annotated structure."""
        # blast search goes first, looping through all the chains
        alignments = {chain: self.blast.run(seq) for chain, seq in self.pdb_seq.items()}

        # map the results onto pdb sequence for every sequence pair from blast
        for chain in self.pdb_seq.keys():
            for alignment in alignments[chain]:
                if alignment == []:
                    continue
                for hsps in alignment[1].hsps:
                    self.map_blast_seq(alignment[0], hsps, chain)

        return self.get_annotated_structure()
class SequenceParser(object):
    """
    Class mapping the pdb, pdb_seqres, wildtype and any given sequence onto wt
    using blast with human sequences database. It produces a report with
    missing, mutated and inserted residues.
    """

    residue_list = ["ARG","ASP","GLU","HIS","ASN","GLN","LYS","SER","THR", "HIS", "HID","PHE","LEU","ILE","TYR","TRP","VAL","MET","PRO","CYS","ALA","GLY"]

    def __init__(self, pdb_file, sequence=None, wt_protein_id=None):
        """
        Parse ``pdb_file`` and resolve the wild-type protein to map onto.

        :param pdb_file: passed to ``PDBParser.get_structure`` and to
            ``SeqIO.parse(..., 'pdb-seqres')``.
        :param sequence: accepted but not used by this constructor.
        :param wt_protein_id: id of the wild-type ``Protein``; when falsy the
            wild type is looked up through the ``Structure`` whose pdb code
            matches the first SEQRES record.
        """
        # dictionary of 'ParsedResidue' object storing information about alignments and bw numbers
        self.mapping = {}
        self.residues = {}
        self.blast = BlastSearch(blastdb=os.sep.join([settings.STATICFILES_DIRS[0], 'blast', 'protwis_human_blastdb']))

        self.pdb_struct = PDBParser(QUIET=True).get_structure('pdb', pdb_file)[0]
        # a list of SeqRecord objects retrieved from the pdb SEQRES section
        self.seqres = list(SeqIO.parse(pdb_file, 'pdb-seqres'))

        # SeqRecord id is a pdb_code:chain
        self.struct_id = self.seqres[0].id.split(':')[0]
        # If not specified, attempt to get wildtype from pdb.
        if not wt_protein_id:
            self.wt = Structure.objects.get(pdb_code__index=self.struct_id).protein_conformation.protein.parent
        else:
            self.wt = Protein.objects.get(id=wt_protein_id)
        self.wt_seq = str(self.wt.sequence)
        self.fusions = []

        self.parse_pdb(self.pdb_struct)

    def parse_pdb(self, pdb_struct):
        """
        extracting sequence and preparing dictionary of residues
        bio.pdb reads pdb in the following cascade: model->chain->residue->atom

        For every chain, ``self.residues[chain.id]`` receives the residues whose
        name is in ``residue_list`` and ``self.mapping[chain.id]`` receives one
        fresh ``ParsedResidue`` per wild-type residue, keyed by sequence number.
        """
        wt_resi = list(Residue.objects.filter(protein_conformation__protein=self.wt.id))
        for chain in pdb_struct:
            self.residues[chain.id] = []
            # Built per chain on purpose: every chain gets its own ParsedResidue objects.
            self.mapping[chain.id] = {
                x.sequence_number: ParsedResidue(
                    x.amino_acid,
                    x.sequence_number,
                    str(x.display_generic_number) if x.display_generic_number else None,
                )
                for x in wt_resi
            }

            for res in chain:
                # in bio.pdb the residue's id is a tuple of (hetatm flag, residue number, insertion code)
                if res.resname.replace('HID', 'HIS') not in self.residue_list:
                    continue
                self.residues[chain.id].append(res)

    def get_chain_peptides(self, chain_id, gap_threshold=230):
        """
        Get peptides of sequential residue numbers (with respect to 230 aa gaps).

        The maximum length of ICL3 is 230 aa, and fusion proteins usually have
        significantly different numbers, i.e. exceeding the 230 gap between TM5 and 6.

        The maximum allowed gap size can be evaluated automatically, but it is fairly costly:
        max([len(Residue.objects.filter(protein_segment=11, protein_conformation__protein=x)) for x in Protein.objects.filter(species=1)])

        :param chain_id: key of the chain in ``self.residues``.
        :param gap_threshold: a new peptide is started when the numbering jump
            to the next residue is larger than this value.
        :return: list of lists of residues; empty when the chain has no residues.
        """
        chain_residues = self.residues[chain_id]
        rnumbers = [int(x.id[1]) for x in chain_residues]
        last_idx = len(rnumbers) - 1
        peptides = []
        current = []
        for i, rnum in enumerate(rnumbers):
            current.append(chain_residues[i])
            if i == last_idx:
                # FIXME: Assuming that very last residue is actually continuation of a chain
                peptides.append(current)
                break
            if rnumbers[i+1] != rnum+1 and abs(rnum+1 - rnumbers[i+1]) > gap_threshold:
                peptides.append(current)
                current = []
        return peptides

    def get_chain_sequence(self, chain):
        """
        Return the one-letter sequence of the residues in ``chain`` whose
        residue name is in ``residue_list`` (``HID`` is read as ``HIS``).
        """
        return "".join([polypeptide.three_to_one(x.resname.replace('HID', 'HIS')) for x in chain if x.resname in self.residue_list])

    def map_to_wt_blast(self, chain_id, residues = None, sequence=None, starting_aa = 1, seqres = False):
        """
        BLAST a peptide and map the hits against the wild type onto ``chain_id``.

        The query is the sequence of ``residues`` when given, otherwise
        ``sequence``; with neither the method returns without doing anything.
        Hits whose id differs from ``self.wt.id`` are ignored. When the first
        HSP of a hit has an e-value above 1 and ``residues`` was given, the
        residues are recorded in ``self.fusions``.
        """
        if residues:
            seq = self.get_chain_sequence(residues)
        elif sequence:
            seq = sequence
        else:
            return

        alignments = self.blast.run(seq)

        for alignment in alignments:
            if alignment[1].hsps[0].expect > 1. and residues:
                self.fusions.append(residues)
            if self.wt.id != int(alignment[0]):
                continue
            for hsps in alignment[1].hsps:
                self.map_hsps(hsps, chain_id, starting_aa, seqres)

    def map_hsps(self, hsps, chain_id, offset = 1, seqres = False):
        """
        Analyzes the High Similarity Protein Segment.

        Walks the aligned subject (wild type) and query strings column by column
        and updates ``self.mapping[chain_id]``:

        * identical column - marks the wild-type position as present in SEQRES
          (``seqres`` true) or stores the pdb residue number
          ``offset - 1 + query position``;
        * mismatch - stores the pdb residue number and the query residue as a mutation;
        * gap in the query - the wild-type residue is absent from the query, so
          only the wild-type position advances;
        * gap in the subject - records an insertion.

        :param hsps: object exposing ``query``, ``sbjct``, ``query_start`` and ``sbjct_start``.
        :param chain_id: key of the chain in ``self.mapping``.
        :param offset: number of the first query residue in pdb numbering.
        :param seqres: whether the query comes from the SEQRES section.
        """
        sbjct_counter = hsps.sbjct_start
        q_counter = hsps.query_start

        for s_aa, q_aa in zip(hsps.sbjct, hsps.query):
            if s_aa == q_aa:
                if seqres:
                    self.mapping[chain_id][sbjct_counter].set_seqres(True)
                else:
                    self.mapping[chain_id][sbjct_counter].set_pdb_res_num(offset - 1 + q_counter)
                sbjct_counter += 1
                q_counter += 1
            elif s_aa != '-' and q_aa != '-':
                self.mapping[chain_id][sbjct_counter].set_pdb_res_num(offset - 1 + q_counter)
                self.mapping[chain_id][sbjct_counter].set_mutation(q_aa)
                sbjct_counter += 1
                q_counter += 1
            elif s_aa != '-' and q_aa == '-':
                # Without this step the wild-type counter stalls on the gap and
                # every following residue is recorded one position too early.
                sbjct_counter += 1
            elif s_aa == '-' and q_aa != '-':
                # NOTE(review): this key is derived from the query position while
                # the mapping is keyed by wild-type sequence number - confirm.
                self.mapping[chain_id][offset - 1 + q_counter].set_insertion(q_aa)
                q_counter += 1

    def map_to_wt_pw(self, chain_id, residues = None, sequence=None, starting_aa = 1, seqres = False):
        """
        Map a peptide onto the wild type with a local pairwise alignment.

        The query is the sequence of ``residues`` when given, otherwise
        ``sequence``; with neither the method returns without doing anything.

        :param chain_id: key of the chain in ``self.mapping``.
        :param starting_aa: mapping key used for the first alignment column.
        :param seqres: when true, identical columns are flagged with ``seqres = True``.
        """
        if residues:
            seq = self.get_chain_sequence(residues)
        elif sequence:
            seq = sequence
        else:
            return

        wt, chain_seq, score, start, end = pairwise2.align.localms(self.wt_seq, seq, 2, -4, -4, -.1, one_alignment_only=True)[0]

        # NOTE(review): wt_seq_start is not assigned anywhere in this class; fall
        # back to 1 so the lookup does not fail with AttributeError - confirm the
        # intended starting sequence number.
        wt_seq_start = getattr(self, 'wt_seq_start', 1)

        # NOTE(review): the add_* calls below differ from the set_* names used in
        # map_hsps; verify them against ParsedResidue.
        offset = 0
        for w, c in zip(wt, chain_seq):
            if w == c:
                if seqres:
                    self.mapping[chain_id][starting_aa + offset].seqres=True
                r = Residue.objects.get(sequence_number=offset+wt_seq_start, protein_conformation__protein=self.wt.id)
                if r.display_generic_number is not None:
                    self.mapping[chain_id][starting_aa + offset].add_gpcrdb(r.display_generic_number)
                offset += 1
            elif c == '-' and w != '-':
                # Debug output kept from the original implementation.
                print(offset)
                self.mapping[chain_id][starting_aa + offset].add_deletion()
            elif w != '-' and c != '-' and w != c:
                self.mapping[chain_id][starting_aa + offset].add_mutation(c)
                offset += 1
            elif w == '-' and c != '-':
                self.mapping[chain_id][starting_aa + offset].add_insertion(c)
                offset += 1

    def map_seqres(self):
        """
        Map every SEQRES record onto the wild type via ``map_to_wt_blast``,
        using the record's ``chain`` annotation as the chain id.
        """
        for sr in self.seqres:
            self.map_to_wt_blast(sr.annotations['chain'], sequence=sr.seq, seqres=True)

    def get_report(self):
        """
        Print the mapping of every chain, chains and positions in sorted order.
        """
        for chain in sorted(self.mapping):
            print("Chain {}".format(chain))
            for res in sorted(self.mapping[chain]):
                print(self.mapping[chain][res])

    def save_excel_report(self, file_name):
        """
        Write the mapping to an xlsx workbook with one worksheet per chain.

        Each worksheet starts with a header row followed by one row per mapped
        position (``get_param_list()`` of the entry), in sorted key order.

        :param file_name: passed to ``xlsxwriter.Workbook``.
        """
        workbook = xlsxwriter.Workbook(file_name)

        for chain in sorted(self.mapping):
            worksheet = workbook.add_worksheet(chain)
            worksheet.write_row(0,0,["Protein number", "Residue name", "PDB number", "Generic number", "Mutation", "SEQRES"])

            # Row 0 holds the header, data rows start at 1.
            for row_id, res in enumerate(sorted(self.mapping[chain]), start=1):
                worksheet.write_row(row_id, 0, self.mapping[chain][res].get_param_list())
        workbook.close()