def get(self, request, slug=None, segments=None):
    """Return the alignment of a protein family as a header -> sequence dict.

    Args:
        request: Incoming request (not read by this method).
        slug: Family slug prefix; wild-type proteins from source id 1 whose
            family slug starts with it are aligned.
        segments: Optional comma-separated segment slugs. When omitted, all
            non-partial segments are used.

    Returns:
        A ``Response`` wrapping a dict that maps each FASTA header (without
        the leading ``>``) to the row that follows it, or ``None`` when no
        slug is given.
    """
    if slug is None:
        # NOTE(review): no Response is produced without a slug; presumably
        # the URL conf always supplies one -- confirm.
        return None

    ps = Protein.objects.filter(sequence_type__slug='wt', source__id=1,
                                family__slug__startswith=slug)

    if segments is not None:
        segment_list = segments.split(",")
        ss = ProteinSegment.objects.filter(slug__in=segment_list, partial=False)
    else:
        ss = ProteinSegment.objects.filter(partial=False)

    # create an alignment object
    a = Alignment()
    a.show_padding = False

    # load data from selection into the alignment
    a.load_proteins(ps)
    a.load_segments(ss)

    # build the alignment data matrix
    a.build_alignment()

    # render the fasta template as string
    rows = render_to_string('alignment/alignment_fasta.html', {'a': a}).split("\n")

    # convert the list to a dict
    ali_dict = {}
    # `k` holds the pending header. It starts out empty and a row is only
    # stored while a header is pending, so blank/trailing rows can neither
    # raise NameError nor be stored under a bogus `False` key.
    k = False
    for row in rows:
        if row.startswith(">"):
            k = row[1:]
        elif k:
            ali_dict[k] = row
            k = False

    return Response(ali_dict)
def get(self, request, proteins=None, segments=None):
    """Align the requested wild-type proteins and return them as a dict.

    Args:
        request: Incoming request (not read by this method).
        proteins: Comma-separated protein entry names.
        segments: Optional comma-separated segment slugs; all non-partial
            segments are used when omitted.

    Returns:
        A ``Response`` wrapping ``{fasta header: aligned row}``, or ``None``
        when ``proteins`` is not given.
    """
    if proteins is None:
        return None

    entry_names = proteins.split(",")
    protein_qs = Protein.objects.filter(sequence_type__slug='wt',
                                        entry_name__in=entry_names)

    if segments is None:
        segment_qs = ProteinSegment.objects.filter(partial=False)
    else:
        requested_slugs = segments.split(",")
        segment_qs = ProteinSegment.objects.filter(slug__in=requested_slugs,
                                                   partial=False)

    # Set up the alignment, feed it the selection and build the data matrix.
    alignment = Alignment()
    alignment.show_padding = False
    alignment.load_proteins(protein_qs)
    alignment.load_segments(segment_qs)
    alignment.build_alignment()

    # The FASTA template is rendered to text and parsed back line by line.
    fasta_text = render_to_string('alignment/alignment_fasta.html', {'a': alignment})

    sequences = {}
    pending_header = False
    for line in fasta_text.split("\n"):
        if line.startswith(">"):
            pending_header = line[1:]
            continue
        if pending_header:
            # Only the first row after a header is kept for that header.
            sequences[pending_header] = line
            pending_header = False

    return Response(sequences)
def get(self, request, proteins=None, segments=None):
    """Render the alignment of the requested proteins as an HTML page.

    Args:
        request: Incoming request, passed on to ``render``.
        proteins: Comma-separated protein entry names (no sequence-type
            filter is applied here).
        segments: Optional comma-separated segment slugs; all non-partial
            segments are used when omitted.

    Returns:
        The rendered ``alignment/alignment_ws.html`` response, or ``None``
        when ``proteins`` is not given.
    """
    if proteins is None:
        return None

    protein_list = proteins.split(",")
    ps = Protein.objects.filter(entry_name__in=protein_list)

    if segments is not None:
        segment_list = segments.split(",")
        ss = ProteinSegment.objects.filter(slug__in=segment_list, partial=False)
    else:
        ss = ProteinSegment.objects.filter(partial=False)

    # create an alignment object
    a = Alignment()
    a.show_padding = False

    # load data from selection into the alignment
    a.load_proteins(ps)
    a.load_segments(ss)

    # build the alignment data matrix
    a.build_alignment()

    context = {
        'a': a,
        'num_of_sequences': len(a.proteins),
        'num_residue_columns': len(a.positions) + len(a.segments),
    }
    response = render(request, 'alignment/alignment_ws.html', context)

    # NOTE(review): presumably set so the page can be embedded in frames on
    # other sites -- confirm this header value is still what is wanted.
    response['X-Frame-Options'] = "ALLOWALL"
    return response
def get(self, request, proteins=None, segments=None, statistics=False):
    """Align the requested wild-type proteins, optionally with statistics.

    Args:
        request: Incoming request (not read by this method).
        proteins: Comma-separated protein entry names. The first one
            determines the residue numbering scheme used to resolve generic
            positions.
        segments: Optional comma-separated list mixing segment slugs and
            generic-number labels. Entries that are not slugs of a
            non-partial segment are looked up as generic numbers.
        statistics: When equal to ``True``, per-position statistics are added
            under the ``"statistics"`` key.

    Returns:
        A ``Response`` wrapping ``{fasta header: aligned row}`` (plus the
        optional statistics), or ``None`` when ``proteins`` is not given.
    """
    if proteins is None:
        return None

    protein_list = proteins.split(",")
    ps = Protein.objects.filter(sequence_type__slug='wt', entry_name__in=protein_list)

    # the numbering scheme is taken from the first requested protein
    first_protein = Protein.objects.get(entry_name=protein_list[0])
    s_slug = first_protein.residue_numbering_scheme_id

    gen_list = []
    segment_list = []
    if segments is None:
        ss = ProteinSegment.objects.filter(partial=False)
    else:
        input_list = segments.split(",")
        # slugs of every complete segment, used to classify the input
        protein_segments = ProteinSegment.objects.filter(
            partial=False).values_list('slug', flat=True)
        for s in input_list:
            if s in protein_segments:
                segment_list.append(s)
                continue
            # anything else is treated as a generic position label
            gen_object = ResidueGenericNumberEquivalent.objects.get(
                label=s, scheme__id=s_slug)
            gen_object.properties = {}
            gen_list.append(gen_object)
        ss = ProteinSegment.objects.filter(slug__in=segment_list, partial=False)

    a = Alignment()
    a.show_padding = False
    a.load_proteins(ps)

    # generic numbers and whole segments are loaded in separate calls
    if gen_list:
        a.load_segments(gen_list)
    a.load_segments(ss)

    a.build_alignment()

    # Equality (not truthiness) is kept: a non-empty string such as "no"
    # does not switch statistics on.
    want_statistics = statistics == True
    if want_statistics:
        a.calculate_statistics()

    rendered = render_to_string('alignment/alignment_fasta.html', {'a': a})

    ali_dict = {}
    header = False
    for line in rendered.split("\n"):
        if line.startswith(">"):
            header = line[1:]
        elif header:
            ali_dict[header] = line
            header = False

    if want_statistics:
        feat = {}
        # Each statistics entry is a list of groups of tuples; only the first
        # item of every tuple is kept and the groups are flattened.
        for i, feature in enumerate(AMINO_ACID_GROUPS):
            feat[feature] = [x[0] for d in a.feature_stats[i] for x in d]
        for i, AA in enumerate(AMINO_ACIDS):
            feat[AA] = [x[0] for d in a.amino_acid_stats[i] for x in d]
        ali_dict["statistics"] = feat

    return Response(ali_dict)
def get(self, request, entry_name=None, segments=None):
    """Return the entry name of the structure template closest to a protein.

    Args:
        request: Incoming request (not read by this method).
        entry_name: Entry name of the wild-type reference protein.
        segments: Optional comma-separated segment slugs; when omitted, the
            non-partial segments of category ``'helix'`` are used.

    Returns:
        A ``Response`` with the entry name of ``a.proteins[1]`` after the
        similarity calculation, or ``None`` when ``entry_name`` is missing.

    Raises:
        Protein.DoesNotExist: If no wild-type protein has ``entry_name``.
        IndexError: If the alignment ends up with fewer than two proteins.
    """
    if entry_name is None:
        return None

    ref = Protein.objects.get(sequence_type__slug='wt', entry_name=entry_name)

    # One structure per (parent protein, state), ordered by resolution within
    # each pair. select_related pulls the conformation -> protein -> parent
    # chain in the same query instead of three lookups per structure.
    structures = Structure.objects.order_by(
        'protein_conformation__protein__parent', 'state', 'resolution'
    ).distinct(
        'protein_conformation__protein__parent', 'state'
    ).select_related('protein_conformation__protein__parent')

    ps = [structure.protein_conformation.protein.parent for structure in structures]

    if segments is not None:
        input_list = segments.split(",")
        ss = ProteinSegment.objects.filter(slug__in=input_list, partial=False)
    else:
        ss = ProteinSegment.objects.filter(partial=False, category='helix')

    # create an alignment object
    a = Alignment()
    a.show_padding = False

    # load data from selection into the alignment
    a.load_reference_protein(ref)
    a.load_proteins(ps)
    a.load_segments(ss)

    # build the alignment data matrix
    a.build_alignment()

    # calculate identity and similarity of each row compared to the reference
    a.calculate_similarity()

    # return the entry_name of the closest template
    # NOTE(review): assumes index 0 is the reference and the remaining rows
    # are ranked by similarity -- confirm against Alignment.
    return Response(a.proteins[1].protein.entry_name)
def find_segment_template(self, pconf, sconfs, segments):
    """Align ``sconfs`` against ``pconf.protein`` and return the second row.

    Args:
        pconf: Object whose ``protein`` attribute is used as the reference.
        sconfs: Candidates passed to ``Alignment.load_proteins``.
        segments: Segments passed to ``Alignment.load_segments``.

    Returns:
        ``proteins[1]`` of the alignment after the similarity calculation
        (presumably the best-matching candidate -- confirm against
        ``Alignment.calculate_similarity``).
    """
    alignment = Alignment()

    # reference first, then candidates and the segments to compare over
    alignment.load_reference_protein(pconf.protein)
    alignment.load_proteins(sconfs)
    alignment.load_segments(segments)

    alignment.build_alignment()
    alignment.calculate_similarity()

    ranked_rows = alignment.proteins
    return ranked_rows[1]
def get(self, request, proteins=None, segments=None):
    """Rank proteins by similarity to a reference and return the alignment.

    Args:
        request: Incoming request (not read by this method).
        proteins: Comma-separated protein entry names. The first entry is the
            reference; the remaining ones are compared against it.
        segments: Optional comma-separated list mixing segment slugs and
            generic-number labels. When omitted, all non-partial segments are
            used.

    Returns:
        A ``Response`` wrapping an ``OrderedDict`` keyed by FASTA header and
        sorted by descending similarity. Each value is an ``OrderedDict``
        with the keys ``similarity``, ``identity`` and ``AA``. ``None`` is
        returned when ``proteins`` is not given.

    Raises:
        Protein.DoesNotExist: If the reference is not a known wild-type
            protein.
    """
    if proteins is None:
        return None

    protein_list = proteins.split(",")

    # first in API should be reference
    ps = Protein.objects.filter(sequence_type__slug='wt',
                                entry_name__in=protein_list[1:])
    # Fetch the reference as a single Protein instance (not a QuerySet), the
    # same way every other load_reference_protein() caller passes it.
    reference = Protein.objects.get(sequence_type__slug='wt',
                                    entry_name=protein_list[0])

    # take the numbering scheme from the first protein
    s_slug = reference.residue_numbering_scheme_id

    # Initialised up front so a request without `segments` no longer fails
    # on unbound names.
    gen_list = []
    segment_list = []
    if segments is not None:
        input_list = segments.split(",")

        # fetch a list of all segments
        protein_segments = ProteinSegment.objects.filter(
            partial=False).values_list('slug', flat=True)
        for s in input_list:
            # add to segment list
            if s in protein_segments:
                segment_list.append(s)
            # get generic numbering object for generic positions
            else:
                gen_object = ResidueGenericNumberEquivalent.objects.get(
                    label=s, scheme__id=s_slug)
                gen_object.properties = {}
                gen_list.append(gen_object)

        # fetch all complete protein_segments
        ss = ProteinSegment.objects.filter(slug__in=segment_list, partial=False)
    else:
        ss = ProteinSegment.objects.filter(partial=False)

    # create an alignment object
    a = Alignment()
    a.show_padding = False

    # load data from API into the alignment
    a.load_reference_protein(reference)
    a.load_proteins(ps)

    # load generic numbers and TMs separately
    if gen_list:
        a.load_segments(gen_list)
    a.load_segments(ss)

    # build the alignment data matrix
    a.build_alignment()

    # calculate identity and similarity of each row compared to the reference
    a.calculate_similarity()

    # render the fasta template as string
    rows = render_to_string('alignment/alignment_fasta.html', {'a': a}).split("\n")

    # convert the list to a dict
    ali_dict = {}
    k = False
    # `num` walks a.proteins in step with the sequence rows of the template.
    num = 0
    for row in rows:
        if row.startswith(">"):
            k = row[1:]
        elif k:
            # add the query as 100 identical/similar to the beginning (like on the website)
            if num == 0:
                a.proteins[num].identity = 100
                a.proteins[num].similarity = 100
            entry = a.proteins[num]
            # Values may carry padding spaces, hence str() + replace before int().
            ali_dict[k] = OrderedDict([
                ("similarity", int(str(entry.similarity).replace(" ", ""))),
                ("identity", int(str(entry.identity).replace(" ", ""))),
                ("AA", row),
            ])
            num += 1
            k = False

    ali_dict_ordered = OrderedDict(
        sorted(ali_dict.items(), key=lambda x: x[1]['similarity'], reverse=True))
    return Response(ali_dict_ordered)
def get(self, request, entry_name=None, segments=None):
    """Return the entry name of the structure template closest to a protein.

    Args:
        request: Incoming request (not read by this method).
        entry_name: Entry name of the wild-type reference protein.
        segments: Optional comma-separated segment slugs; when omitted, the
            non-partial segments of category ``'helix'`` are used.

    Returns:
        A ``Response`` with the entry name of ``a.proteins[1]`` after the
        similarity calculation, or ``None`` when ``entry_name`` is missing.

    Raises:
        Protein.DoesNotExist: If no wild-type protein has ``entry_name``.
        IndexError: If the alignment ends up with fewer than two proteins.
    """
    if entry_name is None:
        return None

    ref = Protein.objects.get(sequence_type__slug='wt', entry_name=entry_name)

    # One structure per (parent protein, state), ordered by resolution within
    # each pair. The related conformation, protein and parent rows are joined
    # in up front so the comprehension below issues no per-structure queries.
    structures = (
        Structure.objects
        .order_by('protein_conformation__protein__parent', 'state', 'resolution')
        .distinct('protein_conformation__protein__parent', 'state')
        .select_related('protein_conformation__protein__parent')
    )
    ps = [structure.protein_conformation.protein.parent for structure in structures]

    if segments is not None:
        input_list = segments.split(",")
        ss = ProteinSegment.objects.filter(slug__in=input_list, partial=False)
    else:
        ss = ProteinSegment.objects.filter(partial=False, category='helix')

    # create an alignment object
    a = Alignment()
    a.show_padding = False

    # load data from selection into the alignment
    a.load_reference_protein(ref)
    a.load_proteins(ps)
    a.load_segments(ss)

    # build the alignment data matrix
    a.build_alignment()

    # calculate identity and similarity of each row compared to the reference
    a.calculate_similarity()

    # return the entry_name of the closest template
    # NOTE(review): assumes index 0 is the reference and the remaining rows
    # are ranked by similarity -- confirm against Alignment.
    return Response(a.proteins[1].protein.entry_name)
def get_segment_template(protein, segments=('TM1', 'TM2', 'TM3', 'TM4', 'TM5', 'TM6', 'TM7'), state=None):
    """Align structure-backed proteins against ``protein`` and return row 1.

    Args:
        protein: Reference protein; structures of this protein are excluded
            from the candidates (see the note on ``state``).
        segments: Segment slugs to align over. The default is an immutable
            tuple so it cannot be modified across calls.
        state: Optional state. When truthy it is added to the exclusion
            lookup.

    Returns:
        ``proteins[1]`` of the alignment after the similarity calculation.
    """
    # Both lookups go into a single exclude() call, so with a state only
    # structures matching the protein AND the state are dropped.
    # NOTE(review): if `state` was meant to restrict templates to that state,
    # this needs a filter() instead -- confirm the intent.
    excluded = {'protein_conformation__protein': protein.id}
    if state:
        excluded['protein_conformation__state'] = state

    structures = Structure.objects.order_by(
        'protein_conformation__protein__parent', 'resolution').exclude(**excluded)

    a = Alignment()
    a.load_reference_protein(protein)
    a.load_proteins([x.protein_conformation.protein.parent for x in structures])
    a.load_segments(ProteinSegment.objects.filter(slug__in=segments))
    a.build_alignment()
    a.calculate_similarity()
    return a.proteins[1]
def receptor_mammal_representatives(self):
    """Flag unrefined structures as mammal and as closest-to-human.

    Every unrefined structure gets its ``mammal`` attribute set from
    ``self.check_uniprot_if_mammal`` (cached per species common name) and is
    saved. The structures are then grouped per (family slug, state). Within
    each group ``closest_to_human`` is set to ``True`` for the structures of
    one "best" species and ``False`` for the rest:

    * ``'Human'`` if present,
    * the only species if there is just one,
    * otherwise the species of ``a.proteins[1]`` from an alignment against
      the human wild-type protein of the family over TM1-TM7.

    Raises:
        Protein.DoesNotExist: If a family without a human structure has no
            human wild-type protein to compare against.
    """
    structures = Structure.objects.filter(refined=False).prefetch_related(
        "pdb_code", "state", "protein_conformation__protein__parent__family",
        "protein_conformation__protein__species")

    # Keyed by a (family slug, state slug) tuple rather than a "-"-joined
    # string, so the parts never have to be split apart again.
    structures_by_conformation = {}
    is_mammal = {}

    ## Go through all structures and deduce if mammal and prepare receptor/state sets to find most "human"
    for s in structures:
        pdb = s.pdb_code.index
        state = s.state.slug
        protein = s.protein_conformation.protein.parent
        slug = protein.family.slug
        species = s.protein_conformation.protein.species.common_name

        # one mammal lookup per species
        if species not in is_mammal:
            is_mammal[species] = self.check_uniprot_if_mammal(protein)

        s.mammal = is_mammal[species]
        s.save()

        structures_by_conformation.setdefault((slug, state), []).append(
            [pdb, species, protein, s])

    print("DEBUG", is_mammal)

    for (p_slug, _state), pdbs in structures_by_conformation.items():
        distinct_species = {x[1] for x in pdbs}

        if 'Human' in distinct_species:
            # Human always best..
            best_species = 'Human'
        elif len(distinct_species) == 1:
            # If only one type.. then it must be the best match
            best_species = next(iter(distinct_species))
        else:
            # There are more than 1 species, and human is not in it.. do similarity
            candidate_proteins = {x[2] for x in pdbs}
            ref_p = Protein.objects.get(family__slug=p_slug,
                                        species__common_name='Human',
                                        sequence_type__slug='wt')
            a = Alignment()
            a.load_reference_protein(ref_p)
            a.load_proteins(candidate_proteins)
            a.load_segments(
                ProteinSegment.objects.filter(slug__in=[
                    'TM1', 'TM2', 'TM3', 'TM4', 'TM5', 'TM6', 'TM7'
                ]))
            a.build_alignment()
            a.calculate_similarity()
            best_species = a.proteins[1].protein.species.common_name

        ## Now that we know which species to label as most "human" go through structures and label
        for _pdb, species, _protein, structure in pdbs:
            structure.closest_to_human = species == best_species
            structure.save()
def get(self, request, proteins=None, segments=None, statistics=False):
    """Return the alignment of the given wild-type proteins as a dict.

    Args:
        request: Incoming request (not read by this method).
        proteins: Comma-separated protein entry names; the numbering scheme
            of the first one is used to resolve generic-number labels.
        segments: Optional comma-separated mix of segment slugs and
            generic-number labels; all non-partial segments when omitted.
        statistics: Statistics are computed and returned under
            ``"statistics"`` only when this compares equal to ``True``.

    Returns:
        A ``Response`` wrapping ``{fasta header: aligned row}``, or ``None``
        when ``proteins`` is not given.
    """
    def first_items(stat_groups):
        # Drops the frequencies: keeps element 0 of every tuple and flattens
        # the per-group lists into one list.
        return [entry[0] for group in stat_groups for entry in group]

    if proteins is None:
        return None

    protein_list = proteins.split(",")
    ps = Protein.objects.filter(sequence_type__slug='wt', entry_name__in=protein_list)

    # numbering scheme of the first requested protein
    s_slug = Protein.objects.get(entry_name=protein_list[0]).residue_numbering_scheme_id

    gen_list, segment_list = [], []
    if segments is not None:
        input_list = segments.split(",")
        known_slugs = ProteinSegment.objects.filter(partial=False).values_list('slug', flat=True)
        for label in input_list:
            if label not in known_slugs:
                # not a segment slug: resolve it as a generic position
                gen_object = ResidueGenericNumberEquivalent.objects.get(label=label, scheme__id=s_slug)
                gen_object.properties = {}
                gen_list.append(gen_object)
            else:
                segment_list.append(label)
        ss = ProteinSegment.objects.filter(slug__in=segment_list, partial=False)
    else:
        ss = ProteinSegment.objects.filter(partial=False)

    a = Alignment()
    a.show_padding = False
    a.load_proteins(ps)
    # generic positions go in before the regular segments
    if gen_list:
        a.load_segments(gen_list)
    a.load_segments(ss)
    a.build_alignment()

    if statistics == True:
        a.calculate_statistics()

    fasta_rows = render_to_string('alignment/alignment_fasta.html', {'a': a}).split("\n")

    ali_dict = {}
    current = False
    for fasta_row in fasta_rows:
        if fasta_row.startswith(">"):
            current = fasta_row[1:]
            continue
        if current:
            ali_dict[current] = fasta_row
            current = False

    if statistics == True:
        feat = {
            feature: first_items(a.feature_stats[i])
            for i, feature in enumerate(AMINO_ACID_GROUPS)
        }
        feat.update(
            (AA, first_items(a.amino_acid_stats[i]))
            for i, AA in enumerate(AMINO_ACIDS)
        )
        ali_dict["statistics"] = feat

    return Response(ali_dict)
def main_func(self, positions, iteration, count, lock):
    """Build and store consensus sequences for protein families.

    Families are claimed one at a time through the shared ``count`` counter,
    so several workers can run this function side by side.

    Args:
        positions: Unused; kept for interface compatibility.
        iteration: Unused; kept for interface compatibility.
        count: Shared counter exposing ``.value`` -- the index of the next
            family to process (presumably a ``multiprocessing.Value``).
        lock: Context manager guarding ``count`` (presumably a
            ``multiprocessing.Lock``).
    """
    if self.signprot:
        signprot_fam = ProteinFamily.objects.get(name=self.signprot)
        # The '_' at the end is needed to skip the Alpha and Arrestin consensus sequences
        families = ProteinFamily.objects.filter(slug__startswith=signprot_fam.slug+'_').all()
        self.segments = ProteinSegment.objects.filter(partial=False, proteinfamily=self.signprot)
    else:
        families = self.families

    if self.input_slug:
        families = ProteinFamily.objects.filter(slug__startswith=self.input_slug)

    num_families = len(families)
    while True:
        # The bounds check and the claim happen under the same lock;
        # otherwise another worker could take the last family between the
        # check and the index, leading to an IndexError.
        with lock:
            if count.value >= num_families:
                break
            family = families[count.value]
            count.value += 1

        # get proteins in this family
        proteins = Protein.objects.filter(family__slug__startswith=family.slug, sequence_type__slug='wt',
                                          species__common_name="Human").prefetch_related('species', 'residue_numbering_scheme')

        # if family does not have human equivalents, like Class D1
        if len(proteins) == 0:
            proteins = Protein.objects.filter(family__slug__startswith=family.slug,
                                              sequence_type__slug='wt').prefetch_related('species', 'residue_numbering_scheme')

        if proteins.count() <= 1:
            continue

        self.logger.info('Building alignment for {}'.format(family))

        # create alignment
        a = Alignment()
        a.load_proteins(proteins)
        a.load_segments(self.segments)
        a.build_alignment()
        a.calculate_statistics()

        try:
            # Save alignment
            AlignmentConsensus.objects.create(slug=family.slug, alignment=pickle.dumps(a))

            # Load alignment to ensure it works
            a = pickle.loads(AlignmentConsensus.objects.get(slug=family.slug).alignment)
            self.logger.info('Succesfully pickled {}'.format(family))
        except Exception:
            # Best effort: a failed round-trip is logged and the in-memory
            # alignment is used. KeyboardInterrupt/SystemExit still propagate.
            self.logger.error('Failed pickle for {}'.format(family))

        self.logger.info('Completed building alignment for {}'.format(family))

        # get (forced) consensus sequence from alignment object
        family_consensus = ''.join(
            aa for s in a.forced_consensus.values() for aa in s.values())

        # create sequence type 'consensus'
        sequence_type, created = ProteinSequenceType.objects.get_or_create(slug='consensus',
                                                                           defaults={'name': 'Consensus',})
        if created:
            self.logger.info('Created protein sequence type {}'.format(sequence_type.name))

        # create a protein record
        consensus_name = family.name + " consensus"
        residue_numbering_scheme = proteins[0].residue_numbering_scheme

        up = dict()
        up['entry_name'] = slugify(consensus_name)
        # disambiguate with the first part of the family slug on a name clash
        if Protein.objects.filter(entry_name=up['entry_name']).exists():
            up['entry_name'] += "-" + family.slug.split('_')[0]
        up['source'] = "OTHER"
        up['species_latin_name'] = proteins[0].species.latin_name
        up['species_common_name'] = proteins[0].species.common_name
        up['sequence'] = family_consensus
        up['names'] = up['genes'] = []

        self.create_protein(consensus_name, family, sequence_type, residue_numbering_scheme, False, up)

        # get protein anomalies in family
        all_constrictions = []
        constriction_freq = dict()
        consensus_pas = dict()  # a constriction has to be in all sequences to be included in the consensus
        pcs = ProteinConformation.objects.filter(protein__in=proteins,
                                                 state__slug=settings.DEFAULT_PROTEIN_STATE).prefetch_related('protein_anomalies')
        for pc in pcs:
            pas = pc.protein_anomalies.all().prefetch_related('generic_number__protein_segment', 'anomaly_type')
            for pa in pas:
                pa_label = pa.generic_number.label
                pa_type = pa.anomaly_type.slug
                pa_segment_slug = pa.generic_number.protein_segment.slug

                # bulges are directly added to the consensus list
                if pa_type == 'bulge':
                    if pa_segment_slug not in consensus_pas:
                        consensus_pas[pa_segment_slug] = []
                    if pa not in consensus_pas[pa_segment_slug]:
                        consensus_pas[pa_segment_slug].append(pa)

                # a constriction's frequency is counted
                else:
                    if pa not in all_constrictions:
                        all_constrictions.append(pa)

                    if pa_label in constriction_freq:
                        constriction_freq[pa_label] += 1
                    else:
                        constriction_freq[pa_label] = 1

        # go through constrictions to see which ones should be included in the consensus
        for pa in all_constrictions:
            pa_label = pa.generic_number.label
            pa_segment_slug = pa.generic_number.protein_segment.slug
            freq = constriction_freq[pa_label]

            # is the constriction in all sequences?
            # NOTE(review): this compares against the number of distinct
            # constrictions, not the number of sequences -- confirm intent.
            if freq == len(all_constrictions):
                if pa_segment_slug not in consensus_pas:
                    consensus_pas[pa_segment_slug] = []
                consensus_pas[pa_segment_slug].append(pa)

        # create residues
        pc = ProteinConformation.objects.get(protein__entry_name=up['entry_name'],
                                             state__slug=settings.DEFAULT_PROTEIN_STATE)
        segment_info = self.get_segment_residue_information(a.forced_consensus)
        ref_positions, segment_starts, segment_aligned_starts, segment_ends, segment_aligned_ends = segment_info

        # the signalling-protein family is passed on as one extra trailing
        # argument, exactly as in the signprot branch of the original call
        extra_args = (self.signprot,) if self.signprot else ()
        for segment_slug, s in a.forced_consensus.items():
            if self.signprot:
                segment = ProteinSegment.objects.get(slug=segment_slug, proteinfamily=self.signprot)
            else:
                segment = ProteinSegment.objects.get(slug=segment_slug)

            if segment_slug in consensus_pas:
                protein_anomalies = consensus_pas[segment_slug]
            else:
                protein_anomalies = []

            if segment_slug in segment_starts:
                create_or_update_residues_in_segment(
                    pc, segment, segment_starts[segment_slug],
                    segment_aligned_starts[segment_slug], segment_ends[segment_slug],
                    segment_aligned_ends[segment_slug], self.schemes, ref_positions,
                    protein_anomalies, True, *extra_args)
def get(self, request, slug=None, segments=None, latin_name=None, statistics=False):
    """Return a family alignment with its consensus and optional statistics.

    Args:
        request: Incoming request (not read by this method).
        slug: Family slug prefix; wild-type proteins from source id 1 whose
            family slug starts with it are aligned.
        segments: Optional comma-separated mix of segment slugs and
            generic-number labels; all non-partial segments when omitted.
        latin_name: Optional species latin name to restrict the proteins to.
        statistics: Per-position statistics are returned under
            ``"statistics"`` only when this compares equal to ``True``.

    Returns:
        A ``Response`` wrapping an ``OrderedDict`` of
        ``{fasta header: aligned row}`` plus a ``'CONSENSUS'`` entry, or
        ``None`` when no slug is given.

    Raises:
        IndexError: If no protein matches the selection.
    """
    if slug is None:
        return None

    # Check for specific species
    if latin_name is not None:
        ps = Protein.objects.filter(sequence_type__slug='wt', source__id=1,
                                    family__slug__startswith=slug,
                                    species__latin_name=latin_name)
    else:
        ps = Protein.objects.filter(sequence_type__slug='wt', source__id=1,
                                    family__slug__startswith=slug)

    # take the numbering scheme from the first protein (fetched once, so the
    # scheme and the family prefix come from the same row)
    first_protein = ps[0]
    s_slug = first_protein.residue_numbering_scheme_id
    protein_family = first_protein.family.slug[:3]

    gen_list = []
    segment_list = []
    if segments is not None:
        input_list = segments.split(",")
        # fetch a list of all segments
        protein_segments = ProteinSegment.objects.filter(
            partial=False).values_list('slug', flat=True)
        for s in input_list:
            # add to segment list
            if s in protein_segments:
                segment_list.append(s)
            # get generic numbering object for generic positions
            else:
                gen_object = ResidueGenericNumberEquivalent.objects.get(
                    label=s, scheme__id=s_slug)
                gen_object.properties = {}
                gen_list.append(gen_object)

        # fetch all complete protein_segments
        ss = ProteinSegment.objects.filter(slug__in=segment_list, partial=False)
    else:
        ss = ProteinSegment.objects.filter(partial=False)

    # keep only the segments belonging to the protein family type, selected
    # by the first three characters of the family slug
    if int(protein_family) < 100:
        ss = [s for s in ss if s.proteinfamily == 'GPCR']
    elif protein_family == "100":
        ss = [s for s in ss if s.proteinfamily == 'Gprotein']
    elif protein_family == "200":
        ss = [s for s in ss if s.proteinfamily == 'Arrestin']

    # create an alignment object
    a = Alignment()
    a.show_padding = False

    # load data from selection into the alignment
    a.load_proteins(ps)

    # load generic numbers and TMs separately
    if gen_list:
        a.load_segments(gen_list)
    a.load_segments(ss)

    # build the alignment data matrix
    a.build_alignment()

    # statistics are always calculated here: the consensus below needs them
    a.calculate_statistics()

    residue_list = [aa.amino_acid for aa in a.full_consensus]

    # render the fasta template as string
    rows = render_to_string('alignment/alignment_fasta.html', {'a': a}).split("\n")

    # convert the list to a dict
    ali_dict = OrderedDict()
    # `k` is the pending header; rows are only stored while one is pending so
    # blank/trailing rows cannot raise NameError or land under a `False` key.
    k = False
    for row in rows:
        if row.startswith(">"):
            k = row[1:]
        elif k:
            ali_dict[k] = row
            k = False

    ali_dict['CONSENSUS'] = ''.join(residue_list)

    # render statistics for output
    if statistics == True:
        feat = {}
        # keep element 0 of every tuple (drops the frequencies) and flatten
        for i, feature in enumerate(AMINO_ACID_GROUPS):
            feat[feature] = [x[0] for d in a.feature_stats[i] for x in d]
        for i, AA in enumerate(AMINO_ACIDS):
            feat[AA] = [x[0] for d in a.amino_acid_stats[i] for x in d]
        ali_dict["statistics"] = feat

    return Response(ali_dict)
def get(self, request, slug=None, segments=None, latin_name=None, statistics=False):
    """Return a family alignment with its consensus and optional statistics.

    Args:
        request: Incoming request (not read by this method).
        slug: Family slug prefix; wild-type proteins from source id 1 whose
            family slug starts with it are aligned.
        segments: Optional comma-separated mix of segment slugs and
            generic-number labels; all non-partial segments when omitted.
        latin_name: Optional species latin name to restrict the proteins to.
        statistics: Per-position statistics are returned under
            ``"statistics"`` only when this compares equal to ``True``.

    Returns:
        A ``Response`` wrapping an ``OrderedDict`` of
        ``{fasta header: aligned row}`` plus a ``'CONSENSUS'`` entry, or
        ``None`` when no slug is given.

    Raises:
        IndexError: If no protein matches the selection.
    """
    if slug is None:
        return None

    # Check for specific species
    if latin_name is not None:
        ps = Protein.objects.filter(sequence_type__slug='wt', source__id=1,
                                    family__slug__startswith=slug,
                                    species__latin_name=latin_name)
    else:
        ps = Protein.objects.filter(sequence_type__slug='wt', source__id=1,
                                    family__slug__startswith=slug)

    # The first protein supplies both the numbering scheme and the family
    # prefix; it is fetched once so both values come from the same row.
    first_protein = ps[0]
    s_slug = first_protein.residue_numbering_scheme_id
    protein_family = first_protein.family.slug[:3]

    gen_list = []
    segment_list = []
    if segments is not None:
        input_list = segments.split(",")
        # fetch a list of all segments
        protein_segments = ProteinSegment.objects.filter(partial=False).values_list('slug', flat=True)
        for s in input_list:
            # add to segment list
            if s in protein_segments:
                segment_list.append(s)
            # get generic numbering object for generic positions
            else:
                gen_object = ResidueGenericNumberEquivalent.objects.get(label=s, scheme__id=s_slug)
                gen_object.properties = {}
                gen_list.append(gen_object)

        # fetch all complete protein_segments
        ss = ProteinSegment.objects.filter(slug__in=segment_list, partial=False)
    else:
        ss = ProteinSegment.objects.filter(partial=False)

    # restrict the segments to the family type given by the slug prefix
    if int(protein_family) < 100:
        ss = [s for s in ss if s.proteinfamily == 'GPCR']
    elif protein_family == "100":
        ss = [s for s in ss if s.proteinfamily == 'Gprotein']
    elif protein_family == "200":
        ss = [s for s in ss if s.proteinfamily == 'Arrestin']

    # create an alignment object
    a = Alignment()
    a.show_padding = False

    # load data from selection into the alignment
    a.load_proteins(ps)

    # load generic numbers and TMs separately
    if gen_list:
        a.load_segments(gen_list)
    a.load_segments(ss)

    # build the alignment data matrix
    a.build_alignment()

    # always calculated: the consensus below is read from the result
    a.calculate_statistics()

    residue_list = [aa.amino_acid for aa in a.full_consensus]

    # render the fasta template as string
    rows = render_to_string('alignment/alignment_fasta.html', {'a': a}).split("\n")

    # convert the list to a dict
    ali_dict = OrderedDict()
    # Start without a pending header and store a row only while one is
    # pending: blank or trailing rows used to end up under the key `False`
    # (or raise NameError before the first header).
    k = False
    for row in rows:
        if row.startswith(">"):
            k = row[1:]
        elif k:
            ali_dict[k] = row
            k = False

    ali_dict['CONSENSUS'] = ''.join(residue_list)

    # render statistics for output
    if statistics == True:
        feat = {}
        # element 0 of every tuple is kept (frequencies dropped), flattened
        for i, feature in enumerate(AMINO_ACID_GROUPS):
            feat[feature] = [x[0] for d in a.feature_stats[i] for x in d]
        for i, AA in enumerate(AMINO_ACIDS):
            feat[AA] = [x[0] for d in a.amino_acid_stats[i] for x in d]
        ali_dict["statistics"] = feat

    return Response(ali_dict)
def main_func(self, positions, iteration):
    """Build consensus proteins for a slice of ``self.families``.

    For every family in the slice with more than one wild-type protein of
    species id 1, an alignment is built, its forced consensus is stored as a
    new protein of sequence type ``consensus``, and residues are created per
    segment together with the protein anomalies selected for the consensus.

    Args:
        positions: Indexable pair ``(start, stop)`` into ``self.families``;
            a falsy ``stop`` means "until the end".
        iteration: Unused.
    """
    start, stop = positions[0], positions[1]
    families = self.families[start:stop] if stop else self.families[start:]

    for family in families:
        # wild-type proteins of this family
        proteins = Protein.objects.filter(
            family__slug__startswith=family.slug,
            sequence_type__slug='wt',
            species__id=1,
        ).prefetch_related('species', 'residue_numbering_scheme')

        # a consensus needs at least two sequences
        if proteins.count() <= 1:
            continue

        self.logger.info('Building alignment for {}'.format(family))

        a = Alignment()
        a.load_proteins(proteins)
        a.load_segments(self.segments)
        a.build_alignment()
        a.calculate_statistics()

        self.logger.info('Completed building alignment for {}'.format(family))

        # concatenate the (forced) consensus residues of all segments
        family_consensus = ''.join(
            residue
            for segment_residues in a.forced_consensus.values()
            for residue in segment_residues.values()
        )

        # make sure the 'consensus' sequence type exists
        sequence_type, created = ProteinSequenceType.objects.get_or_create(
            slug='consensus', defaults={'name': 'Consensus'})
        if created:
            self.logger.info('Created protein sequence type {}'.format(sequence_type.name))

        # describe and create the consensus protein record
        consensus_name = family.name + " consensus"
        residue_numbering_scheme = proteins[0].residue_numbering_scheme

        entry_name = slugify(consensus_name)
        if Protein.objects.filter(entry_name=entry_name).exists():
            entry_name += "-" + family.slug.split('_')[0]

        # 'names' and 'genes' deliberately share one list object
        shared_empty = []
        up = {
            'entry_name': entry_name,
            'source': "OTHER",
            'species_latin_name': proteins[0].species.latin_name,
            'species_common_name': proteins[0].species.common_name,
            'sequence': family_consensus,
            'names': shared_empty,
            'genes': shared_empty,
        }

        self.create_protein(consensus_name, family, sequence_type,
                            residue_numbering_scheme, False, up)

        # collect the protein anomalies of the family
        all_constrictions = []
        constriction_freq = {}
        consensus_pas = {}
        pcs = ProteinConformation.objects.filter(
            protein__in=proteins,
            state__slug=settings.DEFAULT_PROTEIN_STATE,
        ).prefetch_related('protein_anomalies')
        for pc in pcs:
            pas = pc.protein_anomalies.all().prefetch_related(
                'generic_number__protein_segment', 'anomaly_type')
            for pa in pas:
                pa_label = pa.generic_number.label
                pa_segment_slug = pa.generic_number.protein_segment.slug

                if pa.anomaly_type.slug == 'bulge':
                    # bulges go straight into the consensus list (once each)
                    segment_pas = consensus_pas.setdefault(pa_segment_slug, [])
                    if pa not in segment_pas:
                        segment_pas.append(pa)
                    continue

                # everything else is a constriction: remember it and count
                # how often its label occurs
                if pa not in all_constrictions:
                    all_constrictions.append(pa)
                constriction_freq[pa_label] = constriction_freq.get(pa_label, 0) + 1

        # pick the constrictions that make it into the consensus
        for pa in all_constrictions:
            freq = constriction_freq[pa.generic_number.label]
            if freq != len(all_constrictions):
                continue
            pa_segment_slug = pa.generic_number.protein_segment.slug
            consensus_pas.setdefault(pa_segment_slug, []).append(pa)

        # create the residues of the consensus protein, segment by segment
        pc = ProteinConformation.objects.get(
            protein__entry_name=up['entry_name'],
            state__slug=settings.DEFAULT_PROTEIN_STATE)
        (ref_positions, segment_starts, segment_aligned_starts,
         segment_ends, segment_aligned_ends) = self.get_segment_residue_information(a.forced_consensus)

        for segment_slug, _residues in a.forced_consensus.items():
            segment = ProteinSegment.objects.get(slug=segment_slug)
            protein_anomalies = consensus_pas.get(segment_slug, [])

            if segment_slug not in segment_starts:
                continue
            create_or_update_residues_in_segment(
                pc, segment,
                segment_starts[segment_slug], segment_aligned_starts[segment_slug],
                segment_ends[segment_slug], segment_aligned_ends[segment_slug],
                self.schemes, ref_positions, protein_anomalies, True)
def run_build(self):
    """Store one ``StructureModelRMSD`` record per data row.

    The first entry of ``self.data`` is skipped (presumably a header row).
    Each remaining row must hold exactly 14 string cells: the target PDB
    code, the main template PDB code, a version, eleven RMSD values and
    notes. For every row the target's parent protein is aligned against the
    main template's parent protein to obtain sequence identity and
    similarity, which are stored along with the row values.
    """
    def parse_cell(cell):
        # '-' marks a missing value; only three-character cells containing a
        # dot (e.g. "1.2") are converted to float, the rest stay strings.
        if cell == '-':
            return None
        if '.' in cell and len(cell) == 3:
            return float(cell)
        return cell

    for raw_row in self.data[1:]:
        (pdb, main_temp, version, overall_all, overall_backbone, TM_all,
         TM_backbone, H8, ICL1, ECL1, ICL2, ECL2, ECL3,
         notes) = [parse_cell(cell) for cell in raw_row]

        target_structure = Structure.objects.get(pdb_code__index=pdb.upper())
        main_template = Structure.objects.get(pdb_code__index=main_temp.upper())

        alignment = Alignment()
        alignment.load_reference_protein(
            target_structure.protein_conformation.protein.parent)
        alignment.load_proteins(
            [main_template.protein_conformation.protein.parent])

        # ids of the distinct segments that have residues in the target's
        # parent protein
        segment_ids = Residue.objects.filter(
            protein_conformation__protein=target_structure.protein_conformation.protein.parent
        ).order_by(
            'protein_segment__id'
        ).distinct(
            'protein_segment__id'
        ).values_list('protein_segment', flat=True)
        alignment.load_segments(ProteinSegment.objects.filter(id__in=segment_ids))

        alignment.build_alignment()
        alignment.remove_non_generic_numbers_from_alignment()
        alignment.calculate_similarity()

        template_row = alignment.proteins[1]
        seq_sim = template_row.similarity
        seq_id = template_row.identity

        # the last four, the 4th-5th and the first two characters of the
        # version are re-joined with dashes
        formatted_version = '-'.join((version[-4:], version[3:5], version[:2]))

        StructureModelRMSD.objects.get_or_create(
            target_structure=target_structure,
            main_template=main_template,
            version=formatted_version,
            seq_id=seq_id,
            seq_sim=seq_sim,
            overall_all=overall_all,
            overall_backbone=overall_backbone,
            TM_all=TM_all,
            TM_backbone=TM_backbone,
            H8=H8,
            ICL1=ICL1,
            ECL1=ECL1,
            ICL2=ICL2,
            ECL2=ECL2,
            ECL3=ECL3,
            notes=notes)
def main_func(self, positions, iteration):
    """Build and store a consensus protein for each family in a slice.

    For every family in the selected slice of ``self.families`` the
    wild-type human proteins are aligned over ``self.segments``. The forced
    consensus of that alignment is saved as a protein of sequence type
    ``consensus`` through ``self.create_protein``, and its residues are
    written with ``create_or_update_residues_in_segment``. Families with
    one protein or fewer are skipped.

    Args:
        positions: Indexable pair ``(start, stop)`` used to slice
            ``self.families``; a falsy ``stop`` means "until the end".
        iteration: Not used by this method; kept so the signature stays
            unchanged for the caller.
    """
    # A falsy stop (None or 0) selects everything from start onwards.
    families = self.families[positions[0]:positions[1] or None]

    for family in families:
        # get proteins in this family
        proteins = Protein.objects.filter(
            family__slug__startswith=family.slug,
            sequence_type__slug='wt',
            species__common_name="Human",
        ).prefetch_related('species', 'residue_numbering_scheme')

        if proteins.count() <= 1:
            continue

        self.logger.info('Building alignment for {}'.format(family))

        # create alignment
        a = Alignment()
        a.load_proteins(proteins)
        a.load_segments(self.segments)
        a.build_alignment()
        a.calculate_statistics()

        self.logger.info(
            'Completed building alignment for {}'.format(family))

        # get (forced) consensus sequence from alignment object
        family_consensus = ''.join(
            aa
            for segment_residues in a.forced_consensus.values()
            for aa in segment_residues.values()
        )

        # create sequence type 'consensus'
        sequence_type, created = ProteinSequenceType.objects.get_or_create(
            slug='consensus', defaults={'name': 'Consensus'})
        if created:
            self.logger.info('Created protein sequence type {}'.format(
                sequence_type.name))

        # create a protein record
        consensus_name = family.name + " consensus"

        # Fetch the representative protein once: indexing an unordered
        # queryset repeatedly issues a query each time and is not guaranteed
        # to return the same row.
        first_protein = proteins[0]
        residue_numbering_scheme = first_protein.residue_numbering_scheme

        up = dict()
        up['entry_name'] = slugify(consensus_name)
        if Protein.objects.filter(entry_name=up['entry_name']).exists():
            # Disambiguate a clashing entry name with the top-level slug part.
            up['entry_name'] += "-" + family.slug.split('_')[0]
        up['source'] = "OTHER"
        up['species_latin_name'] = first_protein.species.latin_name
        up['species_common_name'] = first_protein.species.common_name
        up['sequence'] = family_consensus
        # Two separate lists, so that changing one cannot affect the other.
        up['names'] = []
        up['genes'] = []

        self.create_protein(consensus_name, family, sequence_type,
                            residue_numbering_scheme, False, up)

        # get protein anomalies in family
        all_constrictions = []
        constriction_freq = dict()
        # a constriction has to be in all sequences to be included in the
        # consensus
        consensus_pas = dict()

        pcs = ProteinConformation.objects.filter(
            protein__in=proteins,
            state__slug=settings.DEFAULT_PROTEIN_STATE,
        ).prefetch_related('protein_anomalies')

        # Number of sequences examined; len() evaluates and caches the
        # queryset, so the loop below does not query again.
        num_conformations = len(pcs)

        for conformation in pcs:
            pas = conformation.protein_anomalies.all().prefetch_related(
                'generic_number__protein_segment', 'anomaly_type')
            for pa in pas:
                pa_label = pa.generic_number.label
                pa_segment_slug = pa.generic_number.protein_segment.slug

                # bulges are directly added to the consensus list
                if pa.anomaly_type.slug == 'bulge':
                    segment_pas = consensus_pas.setdefault(
                        pa_segment_slug, [])
                    if pa not in segment_pas:
                        segment_pas.append(pa)
                # a constriction's frequency is counted
                else:
                    if pa not in all_constrictions:
                        all_constrictions.append(pa)
                    constriction_freq[pa_label] = (
                        constriction_freq.get(pa_label, 0) + 1)

        # go through constrictions to see which ones should be included in
        # the consensus
        for pa in all_constrictions:
            pa_segment_slug = pa.generic_number.protein_segment.slug
            freq = constriction_freq[pa.generic_number.label]

            # is the constriction in all sequences?  Compare against the
            # number of conformations examined, not against the number of
            # distinct constrictions found.
            if freq == num_conformations:
                consensus_pas.setdefault(pa_segment_slug, []).append(pa)

        # create residues
        pc = ProteinConformation.objects.get(
            protein__entry_name=up['entry_name'],
            state__slug=settings.DEFAULT_PROTEIN_STATE)
        segment_info = self.get_segment_residue_information(
            a.forced_consensus)
        (ref_positions, segment_starts, segment_aligned_starts,
         segment_ends, segment_aligned_ends) = segment_info

        for segment_slug in a.forced_consensus:
            segment = ProteinSegment.objects.get(slug=segment_slug)
            protein_anomalies = consensus_pas.get(segment_slug, [])

            if segment_slug in segment_starts:
                create_or_update_residues_in_segment(
                    pc, segment,
                    segment_starts[segment_slug],
                    segment_aligned_starts[segment_slug],
                    segment_ends[segment_slug],
                    segment_aligned_ends[segment_slug],
                    self.schemes, ref_positions, protein_anomalies, True)
def get(self, request, proteins=None, segments=None):
    """Return an alignment of proteins ranked by similarity to a reference.

    Args:
        request: The incoming request; not read by this method.
        proteins: Comma-separated protein entry names. The first name is
            the reference, the remaining names are aligned against it.
        segments: Optional comma-separated list mixing protein segment
            slugs and generic number labels. Labels are resolved in the
            residue numbering scheme of the reference protein. When
            omitted, all non-partial segments are aligned.

    Returns:
        A ``Response`` wrapping an ``OrderedDict`` keyed by the FASTA
        header of each aligned row. Every value is an ``OrderedDict`` with
        the keys ``similarity``, ``identity`` and ``AA`` (in that order),
        and rows are sorted by descending similarity. If ``proteins`` is
        not given, a ``Response`` with status 400 is returned.
    """
    if proteins is None:
        # Without a reference protein there is nothing to align.
        return Response({'detail': 'No proteins specified.'}, status=400)

    protein_list = proteins.split(",")
    reference_name = protein_list[0]

    # first in API should be reference
    ps = Protein.objects.filter(sequence_type__slug='wt',
                                entry_name__in=protein_list[1:])
    reference = Protein.objects.filter(sequence_type__slug='wt',
                                       entry_name__in=[reference_name])

    # take the numbering scheme from the first protein
    scheme_id = Protein.objects.get(
        entry_name=reference_name).residue_numbering_scheme_id

    gen_list = []
    if segments is None:
        # No selection given: align every complete segment.
        ss = ProteinSegment.objects.filter(partial=False)
    else:
        # fetch all segment slugs once; a set keeps the lookups cheap
        known_slugs = set(
            ProteinSegment.objects.filter(partial=False).values_list(
                'slug', flat=True))
        segment_list = []
        for item in segments.split(","):
            if item in known_slugs:
                segment_list.append(item)
            else:
                # anything that is not a segment slug is treated as a
                # generic number label in the reference's scheme
                gen_object = ResidueGenericNumberEquivalent.objects.get(
                    label=item, scheme__id=scheme_id)
                gen_object.properties = {}
                gen_list.append(gen_object)

        # fetch all complete protein_segments
        ss = ProteinSegment.objects.filter(slug__in=segment_list,
                                           partial=False)

    # create an alignment object
    a = Alignment()
    a.show_padding = False

    # load data from API into the alignment
    a.load_reference_protein(reference)
    a.load_proteins(ps)

    # load generic numbers and TMs separately
    a.load_segments(gen_list)
    a.load_segments(ss)

    # build the alignment data matrix
    a.build_alignment()

    # calculate identity and similarity of each row compared to the reference
    a.calculate_similarity()

    # render the fasta template as string
    rows = render_to_string('alignment/alignment_fasta.html',
                            {'a': a}).split("\n")

    # convert the list to a dict: a ">" line names the entry, the next
    # line carries its sequence
    ali_dict = {}
    header = None
    num = 0
    for row in rows:
        if row.startswith(">"):
            header = row[1:]
        elif header:
            protein = a.proteins[num]
            # add the query as 100 identical/similar to the beginning
            # (like on the website)
            if num == 0:
                protein.identity = 100
                protein.similarity = 100

            # identity/similarity may carry padding spaces; strip them
            # before converting to int
            ali_dict[header] = OrderedDict([
                ("similarity",
                 int(str(protein.similarity).replace(" ", ""))),
                ("identity",
                 int(str(protein.identity).replace(" ", ""))),
                ("AA", row),
            ])
            num += 1
            header = None

    ali_dict_ordered = OrderedDict(
        sorted(ali_dict.items(),
               key=lambda entry: entry[1]['similarity'],
               reverse=True))
    return Response(ali_dict_ordered)