def handle(self, *args, **options):
    """Build Protein, ProteinConformation and Residue rows for alpha subunits.

    For every ``SignprotComplex`` the ``<pdb code>_a`` protein and its
    conformation are fetched, or created from the complex's protein. The
    residues of chain ``sc.alpha`` that carry a CA atom are then paired with
    the residues of ``sc.protein`` and one ``Residue`` per paired structure
    residue is bulk-created on the alpha conformation.

    Pairing happens in three passes:
      1. by sequence number,
      2. amino-acid mismatches covered by SEQADV records of the PDB data
         are accepted,
      3. remaining mismatches are re-assigned from a local pairwise
         alignment of the protein sequence against the chain sequence.

    Args:
        *args: unused.
        **options: command options; a truthy ``options['purge']`` deletes
            the existing ``*_a`` proteins of the 'Alpha' family together
            with their conformations and residues before building.

    A failure while processing one complex is printed and logged; the
    remaining complexes are still processed.
    """
    self.options = options
    if self.options['purge']:
        Residue.objects.filter(
            protein_conformation__protein__entry_name__endswith='_a',
            protein_conformation__protein__family__parent__parent__name='Alpha',
        ).delete()
        ProteinConformation.objects.filter(
            protein__entry_name__endswith='_a',
            protein__family__parent__parent__name='Alpha',
        ).delete()
        Protein.objects.filter(
            entry_name__endswith='_a',
            family__parent__parent__name='Alpha',
        ).delete()

    # Raw strings: the pattern consists of regex escapes (\s, \d, \S).
    # Groups as used below: 1 residue name in the structure, 2 chain id,
    # 3 sequence number in the structure, 4 accession field, 5 reference
    # residue name, 6 reference sequence number, 7 trailing annotation.
    # NOTE(review): group meanings presumably follow the wwPDB SEQADV
    # column layout - confirm against the format description.
    seqadv_pattern = re.compile(
        r'SEQADV\s{1}[A-Z\s\d]{4}\s{1}([A-Z]{3})\s{1}([A-Z]{1})\s+(\d+)'
        r'[\s\S\d]{5}([\s\S\d]{12})([A-Z]{3})\s+(\d+)(\s\S+)')

    # Building protein and protconf objects for g protein structure in complex
    for sc in SignprotComplex.objects.all():
        self.logger.info(
            'Protein, ProteinConformation and Residue build for alpha subunit of {} is building'
            .format(sc))
        try:
            pdb_code = sc.structure.pdb_code.index
            alpha_entry_name = pdb_code.lower() + '_a'

            # Alpha subunit: reuse the protein when it already exists. Only
            # "not found" triggers creation; any other lookup error is
            # reported through the outer handler instead of being hidden.
            try:
                alpha_protein = Protein.objects.get(entry_name=alpha_entry_name)
            except Protein.DoesNotExist:
                alpha_protein = Protein()
                alpha_protein.entry_name = alpha_entry_name
                alpha_protein.accession = None
                alpha_protein.name = alpha_entry_name
                alpha_protein.sequence = sc.protein.sequence
                alpha_protein.family = sc.protein.family
                alpha_protein.parent = sc.protein
                alpha_protein.residue_numbering_scheme = sc.protein.residue_numbering_scheme
                alpha_protein.sequence_type = ProteinSequenceType.objects.get(slug='mod')
                alpha_protein.source = ProteinSource.objects.get(name='OTHER')
                alpha_protein.species = sc.protein.species
                alpha_protein.save()

            try:
                alpha_protconf = ProteinConformation.objects.get(
                    protein__entry_name=alpha_entry_name)
            except ProteinConformation.DoesNotExist:
                alpha_protconf = ProteinConformation()
                alpha_protconf.protein = alpha_protein
                alpha_protconf.state = ProteinState.objects.get(slug='active')
                alpha_protconf.save()

            pdb_text = sc.structure.pdb_data.pdb
            parser = PDBParser(PERMISSIVE=True, QUIET=True)
            structure = parser.get_structure('struct', StringIO(pdb_text))
            chain = structure[0][sc.alpha]

            # Sequence numbers of the chain residues that have a CA atom
            nums = []
            for res in chain:
                try:
                    res['CA']
                except KeyError:
                    continue
                nums.append(res.get_id()[1])

            resis = Residue.objects.filter(protein_conformation__protein=sc.protein)

            # Create first alignment based on sequence numbers.
            # Each value is a mutable pair [structure residue, Residue row];
            # the second item may be replaced further down.
            pdb_num_dict = OrderedDict()
            for n in nums:
                # Hard-coded numbering offset for the start of 6OIJ
                nr = n + 6 if pdb_code == '6OIJ' and n < 30 else n
                pdb_num_dict[n] = [chain[n], resis.get(sequence_number=nr)]

            # Find mismatches
            mismatches = [
                pair for pair in pdb_num_dict.values()
                if AA[pair[0].get_resname()] != pair[1].amino_acid
            ]

            seqadv = [line for line in pdb_text.split('\n') if line.startswith('SEQADV')]
            mutations, shifted_mutations = OrderedDict(), OrderedDict()

            # Search for annotated engineered mutations in pdb SEQADV
            for seqadv_line in seqadv:
                hit = seqadv_pattern.search(seqadv_line)
                if hit is None or hit.group(2) != sc.alpha:
                    continue
                struct_num = int(hit.group(3))
                if hit.group(4).strip() == sc.protein.accession:
                    if hit.group(3) == hit.group(6):
                        mutations[struct_num] = [hit.group(1), hit.group(5)]
                    else:
                        shifted_mutations[struct_num] = [
                            hit.group(1), hit.group(5), int(hit.group(6))]
                else:
                    # Accession differs from the complex's protein: the
                    # record is still taken as a mutation. The original
                    # code listed two exceptions here (6G79: shifted number
                    # with a CONFLICT annotation; 5G53: different
                    # accession); the second one always applies in this
                    # branch, so a single assignment is equivalent.
                    mutations[struct_num] = [hit.group(1), hit.group(5)]

            # Check and clear mismatches that are registered in pdb SEQADV as engineered mutation
            remaining_mismatches = []
            for m in mismatches:
                num = m[0].get_id()[1]
                if num in mutations:
                    if (m[0].get_resname() != mutations[num][0]
                            and m[1].amino_acid != AA[mutations[num][1]]):
                        remaining_mismatches.append(m)
                else:
                    # Shifted mutations and unannotated mismatches both
                    # still need fixing.
                    remaining_mismatches.append(m)

            # Mismatches remained possibly to seqnumber shift, making pairwise alignment to try and fix alignment
            if remaining_mismatches and pdb_code not in ['6OIJ', '6OY9', '6OYA']:
                ppb = PPBuilder()
                seq = ''.join(
                    str(pp.get_sequence())
                    for pp in ppb.build_peptides(chain, aa_only=False))
                pw2 = pairwise2.align.localms(sc.protein.sequence, seq, 2, -1, -.5, -.1)
                ref_seq, temp_seq = str(pw2[0][0]), str(pw2[0][1])

                # Maps structure residue -> Residue row of the protein. For
                # alignment gaps the alignment column index is stored in
                # place of the missing side.
                pdb_wt_dict = OrderedDict()
                j, k = 0, 0  # j walks resis, k walks nums
                for i, (ref, temp) in enumerate(zip(ref_seq, temp_seq)):
                    if ref != '-' and temp != '-':
                        pdb_wt_dict[pdb_num_dict[nums[k]][0]] = resis[j]
                        j += 1
                        k += 1
                    elif ref == '-':
                        pdb_wt_dict[pdb_num_dict[nums[k]][0]] = i
                        k += 1
                    elif temp == '-':
                        pdb_wt_dict[i] = resis[j]
                        j += 1

                for i, r in enumerate(remaining_mismatches):
                    r_num = r[0].get_id()[1]
                    # Adjust for shifted residue when residue is a match.
                    # NOTE(review): for i == 0 the index i - 1 refers to the
                    # LAST remaining mismatch - confirm this is intended.
                    if r_num - remaining_mismatches[i - 1][0].get_id()[1] > 1:
                        pdb_num_dict[r_num - 1][1] = pdb_wt_dict[chain[r_num - 1]]
                    # Adjust for shifted residue when residue is mutated and it's logged in SEQADV
                    if r_num in shifted_mutations:
                        pdb_num_dict[r_num][1] = resis.get(
                            sequence_number=shifted_mutations[r_num][2])
                    # Adjust for shift
                    else:
                        pdb_num_dict[r_num][1] = pdb_wt_dict[r[0]]

            # One new Residue per structure residue: numbering and amino
            # acid come from the structure, annotation from the paired row.
            bulked_residues = []
            for pdb_res, wt_res in pdb_num_dict.values():
                res_obj = Residue()
                res_obj.sequence_number = pdb_res.get_id()[1]
                res_obj.amino_acid = AA[pdb_res.get_resname()]
                res_obj.display_generic_number = wt_res.display_generic_number
                res_obj.generic_number = wt_res.generic_number
                res_obj.protein_conformation = alpha_protconf
                res_obj.protein_segment = wt_res.protein_segment
                bulked_residues.append(res_obj)
            Residue.objects.bulk_create(bulked_residues)
            self.logger.info(
                'Protein, ProteinConformation and Residue build for alpha subunit of {} is finished'
                .format(sc))
        except Exception as msg:
            # Best effort per complex: report and continue with the next one.
            print(
                'Protein, ProteinConformation and Residue build for alpha subunit of {} has failed'
                .format(sc))
            print(msg)
            self.logger.error(
                'Protein, ProteinConformation and Residue build for alpha subunit of {} has failed'
                .format(sc))
def can_create_arrestins(self, family, residue_numbering_scheme, accession, uniprot):
    """Create a wild-type protein with its aliases, genes and structures.

    Args:
        family: value stored as the protein's ``family``.
        residue_numbering_scheme: value stored as the protein's
            ``residue_numbering_scheme``.
        accession: stored as the protein's ``accession`` when truthy.
        uniprot: mapping read for the keys ``source``,
            ``species_latin_name``, ``species_common_name``, ``entry_name``,
            ``names``, ``sequence``, ``genes`` and ``structures``. Each
            structure entry is indexed as ``[0]`` (PDB code) and ``[1]``
            (resolution, ``'-'`` is stored as 0).

    Raises:
        Protein.DoesNotExist: if no protein with the lower-cased entry name
            is stored after the save attempt.
    """
    # get/create protein source
    try:
        source, created = ProteinSource.objects.get_or_create(
            name=uniprot['source'], defaults={'name': uniprot['source']})
        if created:
            self.logger.info('Created protein source ' + source.name)
    except IntegrityError:
        source = ProteinSource.objects.get(name=uniprot['source'])

    # get/create species
    try:
        species, created = Species.objects.get_or_create(
            latin_name=uniprot['species_latin_name'],
            defaults={
                'common_name': uniprot['species_common_name'],
            })
        if created:
            self.logger.info('Created species ' + species.latin_name)
    except IntegrityError:
        species = Species.objects.get(latin_name=uniprot['species_latin_name'])

    # get/create protein sequence type
    # Wild-type for all sequences from source file
    try:
        sequence_type, created = ProteinSequenceType.objects.get_or_create(
            slug='wt',
            defaults={
                'slug': 'wt',
                'name': 'Wild-type',
            })
        if created:
            self.logger.info('Created protein sequence type Wild-type')
    except IntegrityError:
        # Same fallback as for source and species; previously the name was
        # left unbound here and the function failed with a NameError below.
        sequence_type = ProteinSequenceType.objects.get(slug='wt')

    # create protein
    entry_name = uniprot['entry_name'].lower()
    p = Protein()
    p.family = family
    p.species = species
    p.source = source
    p.residue_numbering_scheme = residue_numbering_scheme
    p.sequence_type = sequence_type
    if accession:
        p.accession = accession
    p.entry_name = entry_name
    p.name = uniprot['names'][0]
    p.sequence = uniprot['sequence']

    try:
        p.save()
        self.logger.info('Created protein {}'.format(p.entry_name))
    except Exception as e:
        self.logger.error('Failed creating protein {} {}'.format(p.entry_name, str(e)))

    # Everything below is attached to the stored row, fetched once instead
    # of once per alias/gene/structure. NOTE(review): presumably this lets
    # a failed save (protein already present) still link to the existing
    # protein - confirm.
    pcan = Protein.objects.get(entry_name=entry_name)

    # protein aliases
    for i, alias in enumerate(uniprot['names']):
        a = ProteinAlias()
        a.protein = pcan
        a.name = alias
        a.position = i

        try:
            a.save()
            self.logger.info('Created protein alias ' + a.name + ' for protein ' + p.name)
        except Exception:
            self.logger.error('Failed creating protein alias ' + a.name + ' for protein ' + p.name)

    # genes
    for i, gene in enumerate(uniprot['genes']):
        try:
            g, created = Gene.objects.get_or_create(name=gene, species=species, position=i)
            if created:
                self.logger.info('Created gene ' + g.name + ' for protein ' + p.name)
        except IntegrityError:
            g = Gene.objects.get(name=gene, species=species, position=i)
        g.proteins.add(pcan)

    # structures
    # Linking no longer depends on the last gene object: that check raised
    # a NameError whenever uniprot['genes'] was empty.
    for structure_info in uniprot['structures']:
        pdb_code = structure_info[0]
        resolution = structure_info[1]
        if resolution == '-':
            resolution = 0
        try:
            structure, created = SignprotStructure.objects.get_or_create(
                PDB_code=pdb_code, resolution=resolution)
            if created:
                self.logger.info('Created structure ' + structure.PDB_code + ' for protein ' + p.name)
        except IntegrityError:
            # No structure object exists in this branch, so report the raw
            # PDB code and skip the linking step.
            self.logger.error('Failed creating structure {} for protein {}'.format(pdb_code, p.name))
            continue

        structure.origin.add(pcan)
        structure.save()
def create_protein(self, name, family, sequence_type, residue_numbering_scheme, accession, uniprot):
    """Create a protein with a default-state conformation, aliases and genes.

    Args:
        name: stored as the protein's ``name``.
        family: stored as the protein's ``family``.
        sequence_type: stored as the protein's ``sequence_type``.
        residue_numbering_scheme: stored as the protein's
            ``residue_numbering_scheme``.
        accession: stored as the protein's ``accession`` when truthy.
        uniprot: mapping read for the keys ``source``,
            ``species_latin_name``, ``species_common_name``, ``entry_name``,
            ``sequence``, ``names`` and ``genes``.
    """
    # get/create protein source
    try:
        source, created = ProteinSource.objects.get_or_create(
            name=uniprot['source'], defaults={'name': uniprot['source']})
        if created:
            self.logger.info('Created protein source ' + source.name)
    except IntegrityError:
        source = ProteinSource.objects.get(name=uniprot['source'])

    # get/create species
    try:
        species, created = Species.objects.get_or_create(
            latin_name=uniprot['species_latin_name'],
            defaults={
                'common_name': uniprot['species_common_name'],
            })
        if created:
            self.logger.info('Created species ' + species.latin_name)
    except IntegrityError:
        species = Species.objects.get(latin_name=uniprot['species_latin_name'])

    # create protein
    p = Protein()
    p.family = family
    p.species = species
    p.source = source
    p.residue_numbering_scheme = residue_numbering_scheme
    p.sequence_type = sequence_type
    if accession:
        p.accession = accession
    p.entry_name = uniprot['entry_name']
    p.name = name
    p.sequence = uniprot['sequence']

    try:
        p.save()
        self.logger.info('Created protein {}'.format(p.entry_name))
    except Exception as e:
        # Log the cause as well; a bare except used to discard it.
        self.logger.error('Failed creating protein {} {}'.format(p.entry_name, str(e)))

    # protein conformations
    # NOTE(review): when the save above failed, `p` is presumably unsaved
    # and this create() will raise - confirm whether an early return is
    # wanted instead.
    try:
        ps, created = ProteinState.objects.get_or_create(
            slug=settings.DEFAULT_PROTEIN_STATE,
            defaults={'name': settings.DEFAULT_PROTEIN_STATE.title()})
    except IntegrityError:
        ps = ProteinState.objects.get(slug=settings.DEFAULT_PROTEIN_STATE)

    ProteinConformation.objects.create(protein=p, state=ps)

    # protein aliases
    for i, alias in enumerate(uniprot['names']):
        a = ProteinAlias()
        a.protein = p
        a.name = alias
        a.position = i

        try:
            a.save()
            self.logger.info('Created protein alias ' + a.name + ' for protein ' + p.name)
        except Exception:
            self.logger.error('Failed creating protein alias ' + a.name + ' for protein ' + p.name)

    # genes
    for i, gene in enumerate(uniprot['genes']):
        try:
            g, created = Gene.objects.get_or_create(name=gene, species=species, position=i)
            if created:
                self.logger.info('Created gene ' + g.name + ' for protein ' + p.name)
        except IntegrityError:
            g = Gene.objects.get(name=gene, species=species, position=i)
        g.proteins.add(p)
def cgn_creat_gproteins(self, family, residue_numbering_scheme, accession, uniprot):
    """Create a wild-type G protein with its aliases, genes and structures.

    Args:
        family: value stored as the protein's ``family``.
        residue_numbering_scheme: value stored as the protein's
            ``residue_numbering_scheme``.
        accession: stored as the protein's ``accession`` when truthy.
        uniprot: mapping read for the keys ``source``,
            ``species_latin_name``, ``species_common_name``, ``entry_name``,
            ``names``, ``sequence``, ``genes`` and ``structures``. The
            protein name is the first entry of ``names`` with the
            'Guanine nucleotide-binding protein ' prefix removed. Each
            structure entry is indexed as ``[0]`` (PDB code) and ``[1]``
            (resolution, ``'-'`` is stored as 0).

    Raises:
        Protein.DoesNotExist: if no protein with the lower-cased entry name
            is stored after the save attempt.
    """
    # get/create protein source
    try:
        source, created = ProteinSource.objects.get_or_create(
            name=uniprot['source'], defaults={'name': uniprot['source']})
        if created:
            self.logger.info('Created protein source ' + source.name)
    except IntegrityError:
        source = ProteinSource.objects.get(name=uniprot['source'])

    # get/create species
    try:
        species, created = Species.objects.get_or_create(
            latin_name=uniprot['species_latin_name'],
            defaults={
                'common_name': uniprot['species_common_name'],
            })
        if created:
            self.logger.info('Created species ' + species.latin_name)
    except IntegrityError:
        species = Species.objects.get(latin_name=uniprot['species_latin_name'])

    # get/create protein sequence type
    # Wild-type for all sequences from source file
    try:
        sequence_type, created = ProteinSequenceType.objects.get_or_create(
            slug='wt',
            defaults={
                'slug': 'wt',
                'name': 'Wild-type',
            })
        if created:
            self.logger.info('Created protein sequence type Wild-type')
    except IntegrityError:
        # Same fallback as for source and species; previously the name was
        # left unbound here and the function failed with a NameError below.
        sequence_type = ProteinSequenceType.objects.get(slug='wt')

    # create protein
    entry_name = uniprot['entry_name'].lower()
    full_name = uniprot['names'][0]
    # Same result as split(prefix)[1] when the prefix is present; a name
    # without the prefix is kept whole instead of raising IndexError.
    name_parts = full_name.split('Guanine nucleotide-binding protein ')
    short_name = name_parts[1] if len(name_parts) > 1 else full_name

    p = Protein()
    p.family = family
    p.species = species
    p.source = source
    p.residue_numbering_scheme = residue_numbering_scheme
    p.sequence_type = sequence_type
    if accession:
        p.accession = accession
    p.entry_name = entry_name
    p.name = short_name
    p.sequence = uniprot['sequence']

    try:
        p.save()
        self.logger.info('Created protein {}'.format(p.entry_name))
    except Exception as e:
        self.logger.error('Failed creating protein {} {}'.format(p.entry_name, str(e)))

    # Everything below is attached to the stored row, fetched once instead
    # of once per alias/gene/structure. NOTE(review): presumably this lets
    # a failed save (protein already present) still link to the existing
    # protein - confirm.
    pcgn = Protein.objects.get(entry_name=entry_name)

    # protein aliases
    for i, alias in enumerate(uniprot['names']):
        a = ProteinAlias()
        a.protein = pcgn
        a.name = alias
        a.position = i

        try:
            a.save()
            self.logger.info('Created protein alias ' + a.name + ' for protein ' + p.name)
        except Exception:
            self.logger.error('Failed creating protein alias ' + a.name + ' for protein ' + p.name)

    # genes
    for i, gene in enumerate(uniprot['genes']):
        try:
            g, created = Gene.objects.get_or_create(name=gene, species=species, position=i)
            if created:
                self.logger.info('Created gene ' + g.name + ' for protein ' + p.name)
        except IntegrityError:
            g = Gene.objects.get(name=gene, species=species, position=i)
        g.proteins.add(pcgn)

    # structures
    # Linking no longer depends on the last gene object: that check raised
    # a NameError whenever uniprot['genes'] was empty.
    for structure_info in uniprot['structures']:
        pdb_code = structure_info[0]
        resolution = structure_info[1]
        if resolution == '-':
            resolution = 0
        try:
            structure, created = SignprotStructure.objects.get_or_create(
                PDB_code=pdb_code, resolution=resolution)
            if created:
                self.logger.info('Created structure ' + structure.PDB_code + ' for protein ' + p.name)
        except IntegrityError:
            # No structure object exists in this branch, so report the raw
            # PDB code and skip the linking step.
            self.logger.error('Failed creating structure {} for protein {}'.format(pdb_code, p.name))
            continue

        structure.origin.add(pcgn)
        structure.save()
def create_constructs(self, filenames):
    """Create a protein and a default-state conformation per construct file.

    Each YAML file must define ``protein`` (entry name of the parent
    protein) and ``name`` (construct name). A construct protein is created
    as a 'mod' copy of the parent unless a protein with that name already
    exists; a conformation in ``settings.DEFAULT_PROTEIN_STATE`` is added
    when the protein has none.

    Args:
        filenames: file names inside ``self.construct_data_dir``; when
            falsy, every entry of that directory is considered. Entries
            that are not regular files or whose name starts with '.' are
            ignored.

    Files that cannot be processed are logged and skipped.
    """
    self.logger.info('CREATING CONSTRUCTS')

    # what files should be parsed?
    if not filenames:
        filenames = os.listdir(self.construct_data_dir)

    # parse files
    for source_file in filenames:
        source_file_path = os.sep.join([self.construct_data_dir, source_file])
        if not os.path.isfile(source_file_path) or source_file[0] == '.':
            continue

        self.logger.info('Reading file {}'.format(source_file_path))

        # read the yaml file
        with open(source_file_path, 'r') as f:
            # An explicit Loader is required by current PyYAML releases.
            # NOTE(review): yaml.Loader keeps the previous permissive
            # behaviour and can construct arbitrary Python objects; switch
            # to yaml.safe_load if the construct files are plain YAML.
            sd = yaml.load(f, Loader=yaml.Loader)

        # is a protein specified? (an empty file loads as None)
        if not isinstance(sd, dict) or 'protein' not in sd:
            self.logger.error('Protein not specified for construct, skipping')
            continue

        # the construct name is needed for lookups and messages below
        if 'name' not in sd:
            self.logger.error('Name not specified for construct, skipping')
            continue

        # fetch the parent protein
        try:
            ppc = ProteinConformation.objects.select_related(
                'protein__family', 'protein__species',
                'protein__residue_numbering_scheme').get(
                    protein__entry_name=sd['protein'],
                    state__slug=settings.DEFAULT_PROTEIN_STATE)
        except ProteinConformation.DoesNotExist:
            # abort if parent protein is not found
            self.logger.error('Parent protein {} for construct {} not found, aborting!'.format(
                sd['protein'], sd['name']))
            continue

        if not Protein.objects.filter(name=sd['name']).exists():
            # create a protein record
            p = Protein()
            p.parent = ppc.protein
            p.family = ppc.protein.family
            p.species = ppc.protein.species
            p.residue_numbering_scheme = ppc.protein.residue_numbering_scheme
            p.sequence_type, _ = ProteinSequenceType.objects.get_or_create(
                slug='mod', defaults={'name': 'Modified'})
            p.source, _ = ProteinSource.objects.get_or_create(name='OTHER')
            p.entry_name = slugify(strip_tags(sd['name']))
            p.name = sd['name']
            p.sequence = ppc.protein.sequence

            # save protein (construct)
            try:
                p.save()
                self.logger.info('Created construct {} with parent protein {}'.format(
                    p.name, ppc.protein.entry_name))
            except Exception as e:
                print(e)
                self.logger.error('Failed creating construct {} with parent protein {}'.format(
                    p.name, ppc.protein.entry_name))
                continue
        else:
            p = Protein.objects.get(name=sd['name'])

        if not ProteinConformation.objects.filter(protein=p).exists():
            # create protein conformation record
            pc = ProteinConformation()
            pc.protein = p
            pc.state = ProteinState.objects.get(slug=settings.DEFAULT_PROTEIN_STATE)
            try:
                pc.save()
                self.logger.info('Created conformation {} of protein {}'.format(
                    pc.state.name, p.name))
            except Exception:
                self.logger.error('Failed creating conformation {} of protein {}'.format(
                    pc.state.name, p.entry_name))

        # NOTE: a long commented-out section used to follow here. It built
        # residue records (applying deletions, mutations and fusion-protein
        # segment splits), the Construct record, auxiliary proteins, and
        # expression, solubilization, purification and crystallization
        # records. It was dead code and is available from version control.

    self.logger.info('COMPLETED CREATING CONSTRUCTS')
def create_protein(self, name, family, sequence_type, residue_numbering_scheme, accession, uniprot):
    """Store a protein record plus its default conformation, aliases and genes.

    The protein source and species named in ``uniprot`` are looked up or
    created first. The protein is then saved (a failure is logged together
    with its cause), a conformation in ``settings.DEFAULT_PROTEIN_STATE``
    is created for it, one alias is stored per entry of
    ``uniprot['names']`` and each entry of ``uniprot['genes']`` is linked
    to the protein.

    ``accession`` is only set on the protein when it is truthy.
    """
    # --- protein source -------------------------------------------------
    try:
        protein_source, is_new = ProteinSource.objects.get_or_create(
            name=uniprot['source'], defaults={'name': uniprot['source']})
    except IntegrityError:
        protein_source = ProteinSource.objects.get(name=uniprot['source'])
    else:
        if is_new:
            self.logger.info('Created protein source ' + protein_source.name)

    # --- species --------------------------------------------------------
    try:
        protein_species, is_new = Species.objects.get_or_create(
            latin_name=uniprot['species_latin_name'],
            defaults={'common_name': uniprot['species_common_name']})
    except IntegrityError:
        protein_species = Species.objects.get(latin_name=uniprot['species_latin_name'])
    else:
        if is_new:
            self.logger.info('Created species ' + protein_species.latin_name)

    # --- protein --------------------------------------------------------
    protein = Protein()
    protein.family = family
    protein.species = protein_species
    protein.source = protein_source
    protein.residue_numbering_scheme = residue_numbering_scheme
    protein.sequence_type = sequence_type
    if accession:
        protein.accession = accession
    protein.entry_name = uniprot['entry_name']
    protein.name = name
    protein.sequence = uniprot['sequence']

    try:
        protein.save()
        self.logger.info('Created protein {}'.format(protein.entry_name))
    except Exception as e:
        self.logger.error('Failed creating protein {} {}'.format(protein.entry_name, str(e)))

    # --- default conformation -------------------------------------------
    try:
        state, _ = ProteinState.objects.get_or_create(
            slug=settings.DEFAULT_PROTEIN_STATE,
            defaults={'name': settings.DEFAULT_PROTEIN_STATE.title()})
    except IntegrityError:
        state = ProteinState.objects.get(slug=settings.DEFAULT_PROTEIN_STATE)
    ProteinConformation.objects.create(protein=protein, state=state)

    # --- aliases, numbered in the order given -----------------------------
    for position, alias_name in enumerate(uniprot['names']):
        alias_record = ProteinAlias()
        alias_record.protein = protein
        alias_record.name = alias_name
        alias_record.position = position
        try:
            alias_record.save()
            self.logger.info(
                'Created protein alias ' + alias_record.name + ' for protein ' + protein.name)
        except:
            # Catch-all kept on purpose: any failure is logged and the
            # remaining aliases are still processed.
            self.logger.error(
                'Failed creating protein alias ' + alias_record.name + ' for protein ' + protein.name)

    # --- genes ------------------------------------------------------------
    for position, gene_name in enumerate(uniprot['genes']):
        gene_record = False
        try:
            gene_record, is_new = Gene.objects.get_or_create(
                name=gene_name, species=protein_species, position=position)
        except IntegrityError:
            gene_record = Gene.objects.get(
                name=gene_name, species=protein_species, position=position)
        else:
            if is_new:
                self.logger.info(
                    'Created gene ' + gene_record.name + ' for protein ' + protein.name)

        if gene_record:
            gene_record.proteins.add(protein)
def create_constructs(self, filenames):
    """Create construct records from YAML files in ``self.construct_data_dir``.

    For every readable, non-hidden file a modified ``Protein`` is created
    from the parent protein named in the file, followed by its conformation,
    a ``Construct`` record, optional expression / solubilization /
    purification / crystallization records, fusion-protein links, and one
    ``Residue`` per parent residue that is not deleted (with mutations
    applied). The construct's sequence is then rewritten from those residues.

    Args:
        filenames: File names relative to ``self.construct_data_dir``. When
            falsy, every entry of that directory is processed.

    Returns:
        None.
    """
    self.logger.info('CREATING CONSTRUCTS')

    # what files should be parsed?
    if not filenames:
        filenames = os.listdir(self.construct_data_dir)

    # parse files
    for source_file in filenames:
        source_file_path = os.sep.join([self.construct_data_dir, source_file])
        # skip anything that is not a regular file, and hidden files
        if not os.path.isfile(source_file_path) or source_file[0] == '.':
            continue

        self.logger.info('Reading file {}'.format(source_file_path))
        # read the yaml file
        # NOTE(review): FullLoader can still construct some Python objects;
        # if these files may come from untrusted sources consider safe_load.
        with open(source_file_path, 'r') as f:
            sd = yaml.load(f, Loader=yaml.FullLoader)

        # is a protein specified? (an empty document loads as None)
        if not sd or 'protein' not in sd:
            self.logger.error('Protein not specified for construct, skipping')
            continue

        # fetch the parent protein
        try:
            ppc = ProteinConformation.objects.select_related(
                'protein__family', 'protein__species',
                'protein__residue_numbering_scheme').get(
                    protein__entry_name=sd['protein'],
                    state__slug=settings.DEFAULT_PROTEIN_STATE)
        except ProteinConformation.DoesNotExist:
            # abort if parent protein is not found
            self.logger.error('Parent protein {} for construct {} not found, aborting!'.format(
                sd['protein'], sd['name']))
            continue

        # create a protein record
        p = Protein()
        p.parent = ppc.protein
        p.family = ppc.protein.family
        p.species = ppc.protein.species
        p.residue_numbering_scheme = ppc.protein.residue_numbering_scheme
        p.sequence_type, created = ProteinSequenceType.objects.get_or_create(
            slug='mod', defaults={'name': 'Modified'})
        p.source, created = ProteinSource.objects.get_or_create(name='OTHER')
        p.entry_name = slugify(strip_tags(sd['name']))
        p.name = sd['name']
        p.sequence = ppc.protein.sequence

        # save protein (construct)
        try:
            p.save()
            self.logger.info('Created construct {} with parent protein {}'.format(
                p.name, ppc.protein.entry_name))
        except Exception as e:
            print(e)
            self.logger.error('Failed creating construct {} with parent protein {}'.format(
                p.name, ppc.protein.entry_name))
            continue

        # create protein conformation record
        pc = ProteinConformation()
        pc.protein = p
        pc.state = ProteinState.objects.get(slug=settings.DEFAULT_PROTEIN_STATE)
        try:
            pc.save()
            self.logger.info('Created conformation {} of protein {}'.format(pc.state.name, p.name))
        except Exception:
            self.logger.error('Failed creating conformation {} of protein {}'.format(
                pc.state.name, p.entry_name))
            # every record below hangs off this conformation; nothing more
            # can be stored for this construct
            continue

        # create residue records
        deletions = []
        deletions_list = []
        if 'deletions' in sd and sd['deletions']:
            for t in sd['deletions']:
                # each entry is an inclusive [start, end] range
                deletions += list(range(t[0], t[1] + 1))
                deletions_list.append(str(t[0]) + '-' + str(t[1]))
        deletion_string = ",".join(deletions_list)

        mutations = {}
        if 'mutations' in sd and sd['mutations']:
            for m in sd['mutations']:
                # mutation codes look like <wild type><position><mutant>;
                # the key must be an int because it is looked up with the
                # integer residue sequence number further down
                res_num = int(m[1:-1])
                mutations[res_num] = {
                    'wt_res': m[0],
                    'mut_res': m[-1],
                    'full': m,
                }

        # Create construct record
        c = Construct()
        c.protein_conformation = pc
        c.deletions = deletion_string
        c.save()

        # Create Auxiliary proteins
        # if 'auxiliary_proteins' in sd and sd['auxiliary_proteins']:
        #     ap = AuxProtein()
        #     ap.construct = c
        #     apct = AuxProteinType.objects.create()
        #     ap.protein_type = apct
        #     apct.save()
        #     if 'remarks' in sd['auxiliary_proteins']:
        #         ap.remarks = sd['auxiliary_proteins']['remarks']
        #     ap.save()
        #     for step in sd['auxiliary_proteins']:
        #         if 'type' in step and 'name' in step and'sequence' in step:
        #             ap.protein_type = apct
        #             ap.protein_type, created = AuxProteinType.objects.get_or_create()
        #             ap.name = sd['auxiliary_proteins']['name']
        #             ap.uniprot_id = sd['auxiliary_proteins']['uniprot_id']
        #             ap.sequence = sd['auxiliary_proteins']['sequence']
        #             #mutations if any to be included from mutation model along with reason of mutation
        #             ap.position = sd['auxiliary_proteins']['position']
        #             ap.deletions = sd['auxiliary_proteins']['deletions']
        #         else:
        #             self.logger.error('Auxiliary protein step incorrectly defined for {}'.format(p))

        # create expression records
        if 'expression_sys' in sd and sd['expression_sys']:
            ce = ConstructExpression()
            ce.construct = c
            ce.expression_system, created = ConstructExpressionSystem.objects.get_or_create(
                expression_method=sd['expression_sys']['expression_method'],
                host_cell_type=sd['expression_sys']['host_cell_type'],
                host_cell=sd['expression_sys']['host_cell'])
            # the remarks live inside the expression_sys section
            if 'remarks' in sd['expression_sys']:
                ce.remarks = sd['expression_sys']['remarks']
            ce.save()

        # create solubilization records
        if ('solubilization' in sd and sd['solubilization']
                and 'steps' in sd['solubilization'] and sd['solubilization']['steps']):
            so = ConstructSolubilization()
            so.construct = c
            cl = ChemicalList.objects.create()
            so.chemical_list = cl

            for step in sd['solubilization']['steps']:
                if 'type' in step and 'item' in step and 'concentration' in step:
                    chem = Chemical()
                    chem.chemical_type, created = ChemicalType.objects.get_or_create(
                        name=step['type'])
                    chem.name = step['item']
                    chem.save()

                    cc = ChemicalConc()
                    cc.concentration = step['concentration']
                    cc.chemical = chem  # since ChemicalConc has a ForeignKey to Chemical
                    cc.save()
                    cl.chemicals.add(cc)
                else:
                    self.logger.error('Solubilization step incorrectly defined for {}'.format(p))

            if 'remarks' in sd['solubilization']:
                so.remarks = sd['solubilization']['remarks']
            so.save()

        # create purification records
        if 'purification' in sd and sd['purification'] and sd['purification']['steps']:
            pu = ConstructPurification()
            pu.construct = c
            if 'remarks' in sd['purification']:
                pu.remarks = sd['purification']['remarks']
            pu.save()

            for step in sd['purification']['steps']:
                if 'type' in step and 'description' in step:
                    pust = PurificationStep()
                    pust.description = step['description']
                    pust.purification = pu
                    # 2 values returned by get_or_create
                    pust.purification_type, created = PurificationStepType.objects.get_or_create(
                        name=step['type'])
                    if created:
                        self.logger.info('Created purification step type {}'.format(
                            pust.purification_type))
                    pust.save()
                else:
                    self.logger.error('Purification step incorrectly defined for {}'.format(p))

        # create crystallization records
        if 'crystallization' in sd and sd['crystallization']:
            cy = ConstructCrystallization()
            cy.construct = c
            cyt = CrystallizationMethodTypes.objects.create()
            cy.crystal_type = cyt
            cy.method = sd['crystallization']['method']
            cy.settings = sd['crystallization']['settings']
            cy.protein_conc = sd['crystallization']['protein_conc']
            cl = ChemicalList.objects.create()
            cy.chemical_list = cl

            for step in sd['crystallization']['chemicallist']:
                if 'type' in step and 'item' in step and 'concentration' in step:
                    chem = Chemical()
                    chem.chemical_type, created = ChemicalType.objects.get_or_create(
                        name=step['type'])
                    chem.name = step['item']
                    chem.save()

                    cc = ChemicalConc()
                    cc.concentration = step['concentration']
                    cc.chemical = chem  # since ChemicalConc has a ForeignKey to Chemical
                    cc.save()
                    cl.chemicals.add(cc)
                else:
                    self.logger.error('Crystallization step incorrectly defined for {}'.format(p))

            cy.aqueous_solution_lipid_ratio = sd['crystallization']['aqueous_solution_lipid_ratio_LCP']
            cy.lcp_bolus_volume = sd['crystallization']['LCP_bolus_volume']
            cy.precipitant_solution_volume = sd['crystallization']['precipitant_solution_volume']
            cy.temp = sd['crystallization']['temperature']
            cy.ph = sd['crystallization']['ph']
            if 'remarks' in sd['crystallization']:
                cy.remarks = sd['crystallization']['remarks']
            cy.save()

        # fusion proteins
        split_segments = {}
        if 'fusion_proteins' in sd and sd['fusion_proteins']:
            for fp in sd['fusion_proteins']:
                fp_start = Residue.objects.get(protein_conformation=ppc,
                                               sequence_number=fp['positions'][0])
                fp_end = Residue.objects.get(protein_conformation=ppc,
                                             sequence_number=fp['positions'][1])
                # if the fusion protein is inserted within only one segment (the usual case), split that
                # segment into two segments
                if fp_start and fp_start.protein_segment == fp_end.protein_segment:
                    # get/create split protein segments
                    segment_before, created = ProteinSegment.objects.get_or_create(
                        slug=fp_start.protein_segment.slug + "_1",
                        defaults={
                            'name': fp_start.protein_segment.name,
                            'category': fp_start.protein_segment.category,
                            'partial': True})
                    segment_after, created = ProteinSegment.objects.get_or_create(
                        slug=fp_start.protein_segment.slug + "_2",
                        defaults={
                            'name': fp_start.protein_segment.name,
                            'category': fp_start.protein_segment.category,
                            'partial': True})

                    # keep track of information about split segments
                    split_segments[fp_start.protein_segment.slug] = {
                        'start': {
                            'sequence_number': fp['positions'][0],
                            'segment': segment_before,
                        },
                        'end': {
                            'sequence_number': fp['positions'][1],
                            'segment': segment_after,
                        },
                    }
                else:
                    # the fusion spans two segments: use those as the segments
                    # before and after, so they are never left undefined or
                    # carried over from a previous fusion protein
                    segment_before = fp_start.protein_segment
                    segment_after = fp_end.protein_segment

                # get/insert fusion protein
                fusion, _ = ProteinFusion.objects.get_or_create(
                    name=fp['name'], defaults={'sequence': fp['sequence']})

                # create relationship with protein
                ProteinFusionProtein.objects.create(
                    protein=p, protein_fusion=fusion,
                    segment_before=segment_before, segment_after=segment_after)

        prs = Residue.objects.filter(protein_conformation=ppc).prefetch_related(
            'protein_conformation__protein', 'protein_segment', 'generic_number',
            'display_generic_number__scheme', 'alternative_generic_numbers__scheme')
        updated_sequence = ''
        for pr in prs:
            if pr.sequence_number in deletions:
                continue

            r = Residue()
            r.protein_conformation = pc
            r.generic_number = pr.generic_number
            r.display_generic_number = pr.display_generic_number
            r.sequence_number = pr.sequence_number

            # check for split segments
            if pr.protein_segment.slug in split_segments:
                split = split_segments[pr.protein_segment.slug]
                rsns = split['start']['sequence_number']
                rsne = split['end']['sequence_number']
                # NOTE(review): residues strictly between the two positions
                # are left without a segment here — confirm this is intended.
                if r.sequence_number <= rsns:
                    r.protein_segment = split['start']['segment']
                elif r.sequence_number >= rsne:
                    r.protein_segment = split['end']['segment']
            else:
                r.protein_segment = pr.protein_segment

            # amino acid, check for mutations
            if r.sequence_number in mutations:
                mutation = mutations[r.sequence_number]
                if mutation['wt_res'] == pr.amino_acid:
                    r.amino_acid = mutation['mut_res']
                else:
                    # NOTE(review): amino_acid is not assigned on this path.
                    self.logger.error(
                        'Mutation {} in construct {} does not match wild-type sequence of {}'.format(
                            mutation['full'], pc.protein.name, ppc.protein.entry_name))
            else:
                r.amino_acid = pr.amino_acid

            # save amino acid to updated sequence
            updated_sequence += r.amino_acid

            # save residue before populating M2M relations
            r.save()

            # alternative generic numbers
            for agn in pr.alternative_generic_numbers.all():
                r.alternative_generic_numbers.add(agn)

        # update sequence
        p.sequence = updated_sequence
        p.save()

    self.logger.info('COMPLETED CREATING CONSTRUCTS')
def handle(self, *args, **options):
    """Build alpha-subunit proteins for complexes, then SignprotStructure objects.

    Steps, in order:
      1. If ``options['purge']`` is set, delete the ``*_a`` residues,
         conformations and proteins of the Alpha family and all
         SignprotStructure / SignprotStructureExtraProteins rows.
      2. Unless ``options['only_signprot_structures']`` is set, for every
         SignprotComplex get or create the ``<pdb>_a`` Protein and its
         ProteinConformation, map the residues of the alpha chain in the PDB
         data onto the residues of ``sc.protein`` and bulk-create them.
         A failure for one complex is logged and the loop continues.
      3. For every protein whose family slug starts with ``100_001`` and
         that has an accession, fetch and build a structure for each PDB id
         that is not already part of a complex.

    Args:
        *args: Unused.
        **options: Reads the keys ``purge`` and ``only_signprot_structures``.
    """
    self.options = options
    if self.options['purge']:
        Residue.objects.filter(
            protein_conformation__protein__entry_name__endswith='_a',
            protein_conformation__protein__family__parent__parent__name='Alpha').delete()
        ProteinConformation.objects.filter(
            protein__entry_name__endswith='_a',
            protein__family__parent__parent__name='Alpha').delete()
        Protein.objects.filter(
            entry_name__endswith='_a',
            family__parent__parent__name='Alpha').delete()
        SignprotStructureExtraProteins.objects.all().delete()
        SignprotStructure.objects.all().delete()

    if not options['only_signprot_structures']:
        # Compiled once; raw string so \s and \d are regex escapes rather
        # than (invalid) Python string escapes. Groups: 1 residue name in
        # the PDB, 2 chain, 3 PDB sequence number, 4 database accession,
        # 5 database residue name, 6 database sequence number, 7 comment.
        seqadv_pattern = re.compile(
            r'SEQADV\s{1}[A-Z\s\d]{4}\s{1}([A-Z]{3})\s{1}([A-Z]{1})\s+(\d+)[\s\S\d]{5}([\s\S\d]{12})([A-Z]{3})\s+(\d+)(\s\S+)')

        # Building protein and protconf objects for g protein structure in complex
        scs = SignprotComplex.objects.all()
        for sc in scs:
            self.logger.info(
                'Protein, ProteinConformation and Residue build for alpha subunit of {} is building'
                .format(sc))
            try:
                # Alpha subunit
                try:
                    alpha_protein = Protein.objects.get(
                        entry_name=sc.structure.pdb_code.index.lower() + '_a')
                except Protein.DoesNotExist:
                    alpha_protein = Protein()
                    alpha_protein.entry_name = sc.structure.pdb_code.index.lower() + '_a'
                    alpha_protein.accession = None
                    alpha_protein.name = sc.structure.pdb_code.index.lower() + '_a'
                    alpha_protein.sequence = sc.protein.sequence
                    alpha_protein.family = sc.protein.family
                    alpha_protein.parent = sc.protein
                    alpha_protein.residue_numbering_scheme = sc.protein.residue_numbering_scheme
                    alpha_protein.sequence_type = ProteinSequenceType.objects.get(slug='mod')
                    alpha_protein.source = ProteinSource.objects.get(name='OTHER')
                    alpha_protein.species = sc.protein.species
                    alpha_protein.save()

                try:
                    alpha_protconf = ProteinConformation.objects.get(
                        protein__entry_name=sc.structure.pdb_code.index.lower() + '_a')
                except ProteinConformation.DoesNotExist:
                    alpha_protconf = ProteinConformation()
                    alpha_protconf.protein = alpha_protein
                    alpha_protconf.state = ProteinState.objects.get(slug='active')
                    alpha_protconf.save()

                pdbp = PDBParser(PERMISSIVE=True, QUIET=True)
                s = pdbp.get_structure('struct', StringIO(sc.structure.pdb_data.pdb))
                chain = s[0][sc.alpha]

                # sequence numbers of all chain residues that have a CA atom
                nums = []
                for res in chain:
                    if 'CA' in res:
                        nums.append(res.get_id()[1])

                resis = Residue.objects.filter(protein_conformation__protein=sc.protein)
                pdb_num_dict = OrderedDict()
                # Create first alignment based on sequence numbers
                for n in nums:
                    if sc.structure.pdb_code.index == '6OIJ' and n < 30:
                        nr = n + 6
                    else:
                        nr = n
                    pdb_num_dict[n] = [chain[n], resis.get(sequence_number=nr)]

                # Find mismatches
                mismatches = []
                for n, res in pdb_num_dict.items():
                    if AA[res[0].get_resname()] != res[1].amino_acid:
                        mismatches.append(res)

                pdb_lines = sc.structure.pdb_data.pdb.split('\n')
                seqadv = []
                for l in pdb_lines:
                    if l.startswith('SEQADV'):
                        seqadv.append(l)

                mutations, shifted_mutations = OrderedDict(), OrderedDict()
                # Search for annotated engineered mutations in pdb SEQADV
                for seqadv_line in seqadv:
                    line_search = seqadv_pattern.search(seqadv_line)
                    if line_search is not None:
                        if line_search.group(2) == sc.alpha:
                            if line_search.group(4).strip() == sc.protein.accession:
                                if line_search.group(3) == line_search.group(6):
                                    mutations[int(line_search.group(3))] = [
                                        line_search.group(1),
                                        line_search.group(5)
                                    ]
                                else:
                                    shifted_mutations[int(line_search.group(3))] = [
                                        line_search.group(1),
                                        line_search.group(5),
                                        int(line_search.group(6))
                                    ]
                            else:
                                # Exception for 6G79
                                if line_search.group(3) != line_search.group(6) \
                                        and 'CONFLICT' in line_search.group(7):
                                    mutations[int(line_search.group(3))] = [
                                        line_search.group(1),
                                        line_search.group(5)
                                    ]
                                # Exception for 5G53
                                if line_search.group(4).strip() != sc.protein.accession:
                                    mutations[int(line_search.group(3))] = [
                                        line_search.group(1),
                                        line_search.group(5)
                                    ]

                remaining_mismatches = []
                # Check and clear mismatches that are registered in pdb SEQADV as engineered mutation
                for m in mismatches:
                    num = m[0].get_id()[1]
                    if num in mutations:
                        if m[0].get_resname() != mutations[num][0] \
                                and m[1].amino_acid != AA[mutations[num][1]]:
                            remaining_mismatches.append(m)
                    elif num in shifted_mutations:
                        remaining_mismatches.append(m)
                    else:
                        remaining_mismatches.append(m)

                ### sanity check
                # print(sc)
                # print(mutations)
                # print(shifted_mutations)
                # print(mismatches)
                # print('======')
                # print(remaining_mismatches)
                # pprint.pprint(pdb_num_dict)

                # Mismatches remained possibly to seqnumber shift, making pairwise alignment to try and fix alignment
                if len(remaining_mismatches) > 0 and sc.structure.pdb_code.index not in [
                        '6OIJ', '6OY9', '6OYA', '6LPB', '6WHA'
                ]:
                    ppb = PPBuilder()
                    seq = ''
                    for pp in ppb.build_peptides(chain, aa_only=False):
                        seq += str(pp.get_sequence())
                    pw2 = pairwise2.align.localms(sc.protein.sequence, seq, 2, -1, -.5, -.1)
                    ref_seq, temp_seq = str(pw2[0][0]), str(pw2[0][1])
                    wt_pdb_dict = OrderedDict()
                    pdb_wt_dict = OrderedDict()
                    # j walks the wild-type residues, k walks the PDB residue numbers
                    j, k = 0, 0
                    for i, ref, temp in zip(range(0, len(ref_seq)), ref_seq, temp_seq):
                        # print(i, ref, temp) # alignment check
                        if ref != '-' and temp != '-':
                            wt_pdb_dict[resis[j]] = pdb_num_dict[nums[k]]
                            pdb_wt_dict[pdb_num_dict[nums[k]][0]] = resis[j]
                            j += 1
                            k += 1
                        elif ref == '-':
                            # gap in the reference: the alignment index stands in for the residue
                            wt_pdb_dict[i] = pdb_num_dict[nums[k]]
                            pdb_wt_dict[pdb_num_dict[nums[k]][0]] = i
                            k += 1
                        elif temp == '-':
                            wt_pdb_dict[resis[j]] = i
                            pdb_wt_dict[i] = resis[j]
                            j += 1

                    for i, r in enumerate(remaining_mismatches):
                        pdb_num = r[0].get_id()[1]
                        # Adjust for shifted residue when residue is a match
                        # NOTE(review): for i == 0 the index i - 1 wraps around
                        # to the last mismatch — confirm this is intended.
                        if pdb_num - remaining_mismatches[i - 1][0].get_id()[1] > 1:
                            pdb_num_dict[pdb_num - 1][1] = pdb_wt_dict[chain[pdb_num - 1]]
                        # Adjust for shifted residue when residue is mutated and it's logged in SEQADV
                        if pdb_num in shifted_mutations:
                            pdb_num_dict[pdb_num][1] = resis.get(
                                sequence_number=shifted_mutations[pdb_num][2])
                        # Adjust for shift
                        else:
                            pdb_num_dict[pdb_num][1] = pdb_wt_dict[r[0]]

                # Custom alignment fix for 6WHA mini-Gq/Gi2/Gs chimera
                # elif sc.structure.pdb_code.index=='6WHA':
                #     ref_seq = 'MTLESIMACCLSEEAKEARRINDEIERQLRRDKRDARRELKLLLLGTGESGKSTFIKQMRIIHGSGYSDEDKRGFTKLVYQNIFTAMQAMIRAMDTLKIPYKYEHNKAHAQLVREVDVEKVSAFENPYVDAIKSLWNDPGIQECYDRRREYQLSDSTKYYLNDLDRVADPAYLPTQQDVLRVRVPTTGIIEYPFDLQSVIFRMVDVGGQRSERRKWIHCFENVTSIMFLVALSEYDQVLVESDNENRMEESKALFRTIITYPWFQNSSVILFLNKKDLLEEKIMY--SHLVDYFPEYDGP----QRDAQAAREFILKMFVDL---NPDSDKIIYSHFTCATDTENIRFVFAAVKDTILQLNLKEYNLV'
                #     temp_seq = '----------VSAEDKAAAERSKMIDKNLREDGEKARRTLRLLLLGADNSGKSTIVK----------------------------------------------------------------------------------------------------------------------------------GIFETKFQVDKVNFHMFDVG-----RRKWIQCFNDVTAIIFVVDSSDYNR----------LQEALNDFKSIWNNRWLRTISVILFLNKQDLLAEKVLAGKSKIEDYFPEFARYTTPDPRVTRAKY-FIRKEFVDISTASGDGRHICYPHFTC-VDTENARRIFNDCKDIILQMNLREYNLV'
                #     for i, ref, temp in zip(range(0,len(ref_seq)), ref_seq, temp_seq):
                #         print(i, ref, temp)
                # pprint.pprint(pdb_num_dict)

                bulked_residues = []
                for key, val in pdb_num_dict.items():
                    # print(key, val) # sanity check
                    # an int in val[1] is an alignment index, i.e. no wild-type residue was matched
                    if not isinstance(val[1], int):
                        res_obj = Residue()
                        res_obj.sequence_number = val[0].get_id()[1]
                        res_obj.amino_acid = AA[val[0].get_resname()]
                        res_obj.display_generic_number = val[1].display_generic_number
                        res_obj.generic_number = val[1].generic_number
                        res_obj.protein_conformation = alpha_protconf
                        res_obj.protein_segment = val[1].protein_segment
                        bulked_residues.append(res_obj)
                    else:
                        self.logger.info(
                            'Skipped {} as no annotation was present, while building for alpha subunit of {}'
                            .format(val[1], sc))
                Residue.objects.bulk_create(bulked_residues)
                self.logger.info(
                    'Protein, ProteinConformation and Residue build for alpha subunit of {} is finished'
                    .format(sc))
            except Exception as msg:
                #print('Protein, ProteinConformation and Residue build for alpha subunit of {} has failed'.format(sc))
                #print(msg)
                #print(traceback.format_exc())
                #exit(0)
                # Log at error level and keep the cause, matching the
                # SignprotStructure failure message below.
                self.logger.error(
                    'Protein, ProteinConformation and Residue build for alpha subunit of {} has failed\n{}: {}'
                    .format(sc, type(msg), msg))

    ### Build SignprotStructure objects from non-complex signprots
    g_prot_alphas = Protein.objects.filter(
        family__slug__startswith='100_001',
        accession__isnull=False)  #.filter(entry_name='gnai1_human')
    complex_structures = SignprotComplex.objects.all().values_list(
        'structure__pdb_code__index', flat=True)
    for a in g_prot_alphas:
        pdb_list = get_pdb_ids(a.accession)
        for pdb in pdb_list:
            if pdb not in complex_structures:
                try:
                    data = self.fetch_gprot_data(pdb, a)
                    if data:
                        self.build_g_prot_struct(a, pdb, data)
                except Exception as msg:
                    self.logger.error(
                        'SignprotStructure of {} {} failed\n{}: {}'.format(
                            a.entry_name, pdb, type(msg), msg))
def main_func(self, positions, iteration):
    """Create constructs from a slice of ``self.filenames``.

    For every readable, non-hidden YAML file in the slice a modified
    ``Protein`` is created from its parent protein, followed by the
    conformation, deletion / mutation / insertion records, optional
    expression / solubilization / purification / crystallization records,
    and one ``Residue`` per parent residue that is not deleted (with
    mutations applied). The construct's sequence is rewritten from those
    residues.

    Args:
        positions: Pair ``(start, end)`` indexing ``self.filenames``; a falsy
            ``end`` means "to the end of the list".
        iteration: Not used in this method.

    Returns:
        None.
    """
    # filenames
    if not positions[1]:
        filenames = self.filenames[positions[0]:]
    else:
        filenames = self.filenames[positions[0]:positions[1]]

    # parse files
    for source_file in filenames:
        source_file_path = os.sep.join([self.construct_data_dir, source_file])
        # skip anything that is not a regular file, and hidden files
        if not os.path.isfile(source_file_path) or source_file[0] == '.':
            continue

        self.logger.info('Reading file {}'.format(source_file_path))
        # read the yaml file
        # NOTE(review): FullLoader can still construct some Python objects;
        # if these files may come from untrusted sources consider safe_load.
        with open(source_file_path, 'r') as f:
            sd = yaml.load(f, Loader=yaml.FullLoader)

        # is a protein specified? (an empty document loads as None)
        if not sd or 'protein' not in sd:
            self.logger.error('Protein not specified for construct, skipping')
            continue

        # fetch the parent protein
        try:
            ppc = ProteinConformation.objects.prefetch_related(
                'protein__family', 'protein__species',
                'protein__residue_numbering_scheme').get(
                    protein__entry_name=sd['protein'],
                    state__slug=settings.DEFAULT_PROTEIN_STATE)
        except ProteinConformation.DoesNotExist:
            # abort if parent protein is not found
            self.logger.error('Parent protein {} for construct {} not found, aborting!'.format(
                sd['protein'], sd['name']))
            continue

        # sequence type
        try:
            sequence_type, created = ProteinSequenceType.objects.get_or_create(
                slug='mod', defaults={'name': 'Modified'})
            if created:
                self.logger.info('Created sequence type {}'.format(sequence_type))
        except IntegrityError:
            sequence_type = ProteinSequenceType.objects.get(slug='mod')

        # protein source
        try:
            protein_source, created = ProteinSource.objects.get_or_create(name='OTHER')
            if created:
                self.logger.info('Created protein source {}'.format(protein_source))
        except IntegrityError:
            protein_source = ProteinSource.objects.get(name='OTHER')

        # create a protein record
        p = Protein()
        p.parent = ppc.protein
        p.family = ppc.protein.family
        p.species = ppc.protein.species
        p.residue_numbering_scheme = ppc.protein.residue_numbering_scheme
        p.sequence_type = sequence_type
        p.source = protein_source
        p.entry_name = slugify(strip_tags(sd['name']))
        p.name = sd['name']
        p.sequence = ppc.protein.sequence

        # save protein (construct)
        try:
            p.save()
            self.logger.info('Created construct {} with parent protein {}'.format(
                p.name, ppc.protein.entry_name))
        except Exception:
            self.logger.error('Failed creating construct {} with parent protein {}'.format(
                p.name, ppc.protein.entry_name))
            continue

        # create protein conformation record
        pc = ProteinConformation()
        pc.protein = p
        pc.state = ProteinState.objects.get(slug=settings.DEFAULT_PROTEIN_STATE)
        try:
            pc.save()
            self.logger.info('Created conformation {} of protein {}'.format(pc.state.name, p.name))
        except Exception:
            self.logger.error('Failed creating conformation {} of protein {}'.format(
                pc.state.name, p.entry_name))
            # every record below hangs off this conformation; nothing more
            # can be stored for this construct
            continue

        # process deletions (save in db, and for sequence processing)
        deletions = []
        if 'deletions' in sd and sd['deletions']:
            for t in sd['deletions']:
                # each entry is an inclusive [start, end] range
                deletions += list(range(t[0], t[1] + 1))
                ConstructDeletion.objects.create(construct=pc, start=t[0], end=t[1])
                # create() always inserts a row, so always log it
                self.logger.info('Created deletion {}-{} for {}'.format(
                    t[0], t[1], pc.protein.entry_name))

        # process mutations (save in db, and for sequence processing)
        mutations = {}
        if 'mutations' in sd and sd['mutations']:
            for m in sd['mutations']:
                # mutation codes look like <wild type><position><mutant>
                res_num = int(m[1:-1])
                mutations[res_num] = {
                    'wt_res': m[0],
                    'mut_res': m[-1],
                    'full': m,
                }
                ConstructMutation.objects.get_or_create(
                    construct=pc,
                    sequence_number=res_num,
                    wild_type_amino_acid=m[0],
                    mutated_amino_acid=m[-1],
                )

        # insertions
        split_segments = {}
        if 'insertions' in sd and sd['insertions']:
            for ins in sd['insertions']:
                ins_start = Residue.objects.get(protein_conformation=ppc,
                                                sequence_number=ins['positions'][0])
                ins_end = Residue.objects.get(protein_conformation=ppc,
                                              sequence_number=ins['positions'][1])
                # if the insertion is within only one segment (the usual case), split that
                # segment into two segments
                if ins_start and ins_start.protein_segment == ins_end.protein_segment:
                    # get/create split protein segments
                    slug_1 = ins_start.protein_segment.slug + "_1"
                    try:
                        segment_before, created = ProteinSegment.objects.get_or_create(
                            slug=slug_1,
                            defaults={
                                'name': ins_start.protein_segment.name,
                                'category': ins_start.protein_segment.category,
                                'partial': True})
                        if created:
                            self.logger.info('Created protein segment {}'.format(segment_before))
                    except IntegrityError:
                        segment_before = ProteinSegment.objects.get(slug=slug_1)

                    slug_2 = ins_start.protein_segment.slug + "_2"
                    try:
                        segment_after, created = ProteinSegment.objects.get_or_create(
                            slug=slug_2,
                            defaults={
                                'name': ins_start.protein_segment.name,
                                'category': ins_start.protein_segment.category,
                                'partial': True})
                        if created:
                            self.logger.info('Created protein segment {}'.format(segment_after))
                    except IntegrityError:
                        segment_after = ProteinSegment.objects.get(slug=slug_2)

                    # keep track of information about split segments
                    split_segments[ins_start.protein_segment.slug] = {
                        'start': {
                            'sequence_number': ins['positions'][0],
                            'segment': segment_before,
                        },
                        'end': {
                            'sequence_number': ins['positions'][1],
                            'segment': segment_after,
                        },
                    }
                # if the insertion covers two segments, use those two as the segments before and after
                elif ins_start:
                    segment_before = ins_start.protein_segment
                    segment_after = ins_end.protein_segment

                # if the insertion replaces a part of the sequence, add that range as a deletion
                if ins['positions'][1] > (ins['positions'][0] + 1):
                    deletions += list(range((ins['positions'][0] + 1), ins['positions'][1]))

                # get/insert fusion protein
                fusion, _ = ProteinFusion.objects.get_or_create(
                    name=ins['name'], defaults={'sequence': ins['sequence']})

                # create relationship with protein
                ProteinFusionProtein.objects.create(
                    protein=p, protein_fusion=fusion,
                    segment_before=segment_before, segment_after=segment_after)

        # create expression records
        if 'expression_sys' in sd and sd['expression_sys']:
            ce = Expression()
            ce.construct = pc
            ce.expression_system, created = ExpressionSystem.objects.get_or_create(
                expression_method=sd['expression_sys']['expression_method'],
                host_cell_type=sd['expression_sys']['host_cell_type'],
                host_cell=sd['expression_sys']['host_cell'])
            # the remarks live inside the expression_sys section
            if 'remarks' in sd['expression_sys']:
                ce.remarks = sd['expression_sys']['remarks']
            ce.save()

        # create solubilization records
        if ('solubilization' in sd and sd['solubilization']
                and 'steps' in sd['solubilization'] and sd['solubilization']['steps']):
            so = Solubilization()
            so.construct = pc
            cl = ChemicalList.objects.create()
            so.chemical_list = cl

            for step in sd['solubilization']['steps']:
                if 'type' in step and 'item' in step and 'concentration' in step:
                    chem = Chemical()
                    chem.chemical_type, created = ChemicalType.objects.get_or_create(
                        name=step['type'])
                    chem.name = step['item']
                    chem.save()

                    cc = ChemicalConc()
                    cc.concentration = step['concentration']
                    cc.chemical = chem  # since ChemicalConc has a ForeignKey to Chemical
                    cc.save()
                    cl.chemicals.add(cc)
                else:
                    self.logger.error('Solubilization step incorrectly defined for {}'.format(p))

            if 'remarks' in sd['solubilization']:
                so.remarks = sd['solubilization']['remarks']
            so.save()

        # create purification records
        if 'purification' in sd and sd['purification'] and sd['purification']['steps']:
            pu = Purification()
            pu.construct = pc
            if 'remarks' in sd['purification']:
                pu.remarks = sd['purification']['remarks']
            pu.save()

            for step in sd['purification']['steps']:
                if 'type' in step and 'description' in step:
                    pust = PurificationStep()
                    pust.description = step['description']
                    pust.purification = pu
                    # 2 values returned by get_or_create
                    pust.purification_type, created = PurificationStepType.objects.get_or_create(
                        name=step['type'])
                    if created:
                        self.logger.info('Created purification step type {}'.format(
                            pust.purification_type))
                    pust.save()
                else:
                    self.logger.error('Purification step incorrectly defined for {}'.format(p))

        # create crystallization records
        if 'crystallization' in sd and sd['crystallization']:
            cy = Crystallization()
            cy.construct = pc
            cyt = CrystallizationMethodTypes.objects.create()
            cy.crystal_type = cyt
            cy.method = sd['crystallization']['method']
            cy.settings = sd['crystallization']['settings']
            cy.protein_conc = sd['crystallization']['protein_conc']
            cl = ChemicalList.objects.create()
            cy.chemical_list = cl

            for step in sd['crystallization']['chemicallist']:
                if 'type' in step and 'item' in step and 'concentration' in step:
                    chem = Chemical()
                    chem.chemical_type, created = ChemicalType.objects.get_or_create(
                        name=step['type'])
                    chem.name = step['item']
                    chem.save()

                    cc = ChemicalConc()
                    cc.concentration = step['concentration']
                    cc.chemical = chem  # since ChemicalConc has a ForeignKey to Chemical
                    cc.save()
                    cl.chemicals.add(cc)
                else:
                    self.logger.error('Crystallization step incorrectly defined for {}'.format(p))

            cy.aqueous_solution_lipid_ratio = sd['crystallization']['aqueous_solution_lipid_ratio_LCP']
            cy.lcp_bolus_volume = sd['crystallization']['LCP_bolus_volume']
            cy.precipitant_solution_volume = sd['crystallization']['precipitant_solution_volume']
            cy.temp = sd['crystallization']['temperature']
            cy.ph = sd['crystallization']['ph']
            if 'remarks' in sd['crystallization']:
                cy.remarks = sd['crystallization']['remarks']
            cy.save()

        # create residues
        prs = Residue.objects.filter(protein_conformation=ppc).prefetch_related(
            'protein_conformation__protein', 'protein_segment', 'generic_number',
            'display_generic_number__scheme', 'alternative_generic_numbers__scheme')
        updated_sequence = ''
        for pr in prs:
            if pr.sequence_number in deletions:
                continue

            r = Residue()
            r.protein_conformation = pc
            r.generic_number = pr.generic_number
            r.display_generic_number = pr.display_generic_number
            r.sequence_number = pr.sequence_number

            # check for split segments
            if pr.protein_segment.slug in split_segments:
                split = split_segments[pr.protein_segment.slug]
                rsns = split['start']['sequence_number']
                rsne = split['end']['sequence_number']
                if r.sequence_number <= rsns:
                    r.protein_segment = split['start']['segment']
                elif r.sequence_number >= rsne:
                    r.protein_segment = split['end']['segment']
            else:
                r.protein_segment = pr.protein_segment

            # amino acid, check for mutations
            if r.sequence_number in mutations:
                mutation = mutations[r.sequence_number]
                if mutation['wt_res'] == pr.amino_acid:
                    r.amino_acid = mutation['mut_res']
                else:
                    # NOTE(review): amino_acid is not assigned on this path.
                    self.logger.error(
                        'Mutation {} in construct {} does not match wild-type sequence of {}'.format(
                            mutation['full'], pc.protein.name, ppc.protein.entry_name))
            else:
                r.amino_acid = pr.amino_acid

            # save amino acid to updated sequence
            updated_sequence += r.amino_acid

            # save residue before populating M2M relations
            r.save()

            # alternative generic numbers
            for agn in pr.alternative_generic_numbers.all():
                r.alternative_generic_numbers.add(agn)

        # update sequence
        p.sequence = updated_sequence
        p.save()
def main_func(self, positions, iteration, count, lock):
    """Create the construct Protein and its conformation for a slice of files.

    For every readable, non-hidden YAML file in the slice, a modified
    ``Protein`` named after the construct is created from its parent protein
    unless a protein with that name already exists, and a
    ``ProteinConformation`` in the default state is created unless the
    protein already has one.

    Args:
        positions: Pair ``(start, end)`` indexing ``self.filenames``; a falsy
            ``end`` means "to the end of the list".
        iteration: Not used in this method.
        count: Not used in this method.
        lock: Not used in this method.

    Returns:
        None.
    """
    # filenames
    if not positions[1]:
        filenames = self.filenames[positions[0]:]
    else:
        filenames = self.filenames[positions[0]:positions[1]]

    # parse files
    for source_file in filenames:
        source_file_path = os.sep.join([self.construct_data_dir, source_file])
        # skip anything that is not a regular file, and hidden files
        if not os.path.isfile(source_file_path) or source_file[0] == '.':
            continue

        self.logger.info('Reading file {}'.format(source_file_path))
        # read the yaml file
        with open(source_file_path, 'r') as f:
            sd = yaml.load(f, Loader=yaml.FullLoader)

        # is a protein specified? (an empty document loads as None)
        if not sd or 'protein' not in sd:
            self.logger.error('Protein not specified for construct, skipping')
            continue

        # fetch the parent protein
        try:
            ppc = ProteinConformation.objects.prefetch_related(
                'protein__family', 'protein__species',
                'protein__residue_numbering_scheme').get(
                    protein__entry_name=sd['protein'],
                    state__slug=settings.DEFAULT_PROTEIN_STATE)
        except ProteinConformation.DoesNotExist:
            # abort if parent protein is not found
            print('Parent protein {} for construct {} not found, aborting!'
                  .format(sd['protein'], sd['name']))
            self.logger.error(
                'Parent protein {} for construct {} not found, aborting!'
                .format(sd['protein'], sd['name']))
            continue

        # sequence type
        try:
            sequence_type, created = ProteinSequenceType.objects.get_or_create(
                slug='mod', defaults={'name': 'Modified'})
            if created:
                self.logger.info('Created sequence type {}'.format(sequence_type))
        except IntegrityError:
            sequence_type = ProteinSequenceType.objects.get(slug='mod')

        # protein source
        try:
            protein_source, created = ProteinSource.objects.get_or_create(name='OTHER')
            if created:
                self.logger.info('Created protein source {}'.format(protein_source))
        except IntegrityError:
            protein_source = ProteinSource.objects.get(name='OTHER')

        if not Protein.objects.filter(name=sd['name']).exists():
            # create a protein record
            p = Protein()
            p.parent = ppc.protein
            p.family = ppc.protein.family
            p.species = ppc.protein.species
            p.residue_numbering_scheme = ppc.protein.residue_numbering_scheme
            p.sequence_type = sequence_type
            p.source = protein_source
            p.entry_name = slugify(strip_tags(sd['name']))
            p.name = sd['name']
            p.sequence = ppc.protein.sequence

            # save protein (construct)
            try:
                p.save()
                self.logger.info('Created construct {} with parent protein {}'.format(
                    p.name, ppc.protein.entry_name))
            except Exception:
                self.logger.error('Failed creating construct {} with parent protein {}'.format(
                    p.name, ppc.protein.entry_name))
                continue
        else:
            # reuse the construct protein created by an earlier run
            p = Protein.objects.get(name=sd['name'])

        if not ProteinConformation.objects.filter(protein=p).exists():
            # create protein conformation record
            pc = ProteinConformation()
            pc.protein = p
            pc.state = ProteinState.objects.get(slug=settings.DEFAULT_PROTEIN_STATE)
            try:
                pc.save()
                self.logger.info('Created conformation {} of protein {}'.format(
                    pc.state.name, p.name))
            except Exception:
                print('Failed creating conformation {} of protein {}'
                      .format(pc.state.name, p.entry_name))
                self.logger.error('Failed creating conformation {} of protein {}'.format(
                    pc.state.name, p.entry_name))
def handle(self, *args, **options):
    """Build alpha-subunit Protein/ProteinConformation/Residue/Rotamer records.

    Reads these keys from ``options``: ``purge``, ``only_signprot_structures``,
    ``s`` and ``debug``.

    Steps, as implemented below:

    1. If ``purge`` is set, delete Residue, ProteinConformation and Protein
       rows whose entry name ends in ``_a`` and whose family grandparent is
       named "Alpha", plus all SignprotStructureExtraProteins and
       SignprotStructure rows.
    2. Unless ``only_signprot_structures`` is set, iterate SignprotComplex
       objects (restricted to the upper-cased PDB codes in ``s`` when given).
       For each one, get or create the ``<pdb>_a`` Protein and its
       ProteinConformation, map the PDB residues of chain ``sc.alpha`` onto
       the residues of ``sc.protein``, save one Residue per mapped position
       and bulk-create the collected rotamers. Any exception raised while
       processing one complex is caught and logged; the loop continues.
    3. If ``s`` is empty, loop over Protein rows with a family slug starting
       with "100_001" and a non-null accession, and call
       ``self.fetch_gprot_data`` / ``self.build_g_prot_struct`` for each PDB
       id returned by ``get_pdb_ids`` that is not already a SignprotComplex
       structure.

    With ``debug`` set, intermediate mappings and the total run time are
    printed.
    """
    startTime = datetime.datetime.now()
    self.options = options
    if self.options["purge"]:
        Residue.objects.filter(
            protein_conformation__protein__entry_name__endswith="_a",
            protein_conformation__protein__family__parent__parent__name=
            "Alpha").delete()
        ProteinConformation.objects.filter(
            protein__entry_name__endswith="_a",
            protein__family__parent__parent__name="Alpha").delete()
        Protein.objects.filter(
            entry_name__endswith="_a",
            family__parent__parent__name="Alpha").delete()
        SignprotStructureExtraProteins.objects.all().delete()
        SignprotStructure.objects.all().delete()

    if not options["only_signprot_structures"]:
        # Building protein and protconf objects for g protein structure in complex
        if options["s"]:
            scs = SignprotComplex.objects.filter(
                structure__pdb_code__index__in=[
                    i.upper() for i in options["s"]
                ])
        else:
            scs = SignprotComplex.objects.all()
        for sc in scs:
            self.logger.info(
                "Protein, ProteinConformation and Residue build for alpha subunit of {} is building"
                .format(sc))
            try:
                # Alpha subunit
                # NOTE(review): the bare except treats any failure of the
                # lookup as "does not exist yet" and creates a new Protein.
                try:
                    alpha_protein = Protein.objects.get(
                        entry_name=sc.structure.pdb_code.index.lower() +
                        "_a")
                except:
                    alpha_protein = Protein()
                    alpha_protein.entry_name = sc.structure.pdb_code.index.lower(
                    ) + "_a"
                    alpha_protein.accession = None
                    alpha_protein.name = sc.structure.pdb_code.index.lower(
                    ) + "_a"
                    alpha_protein.sequence = sc.protein.sequence
                    alpha_protein.family = sc.protein.family
                    alpha_protein.parent = sc.protein
                    alpha_protein.residue_numbering_scheme = sc.protein.residue_numbering_scheme
                    alpha_protein.sequence_type = ProteinSequenceType.objects.get(
                        slug="mod")
                    alpha_protein.source = ProteinSource.objects.get(
                        name="OTHER")
                    alpha_protein.species = sc.protein.species
                    alpha_protein.save()

                try:
                    alpha_protconf = ProteinConformation.objects.get(
                        protein__entry_name=sc.structure.pdb_code.index.
                        lower() + "_a")
                except:
                    alpha_protconf = ProteinConformation()
                    alpha_protconf.protein = alpha_protein
                    alpha_protconf.state = ProteinState.objects.get(
                        slug="active")
                    alpha_protconf.save()

                # Parse the stored PDB text and collect the residue numbers of
                # the alpha chain that have a CA atom and a blank hetero flag.
                pdbp = PDBParser(PERMISSIVE=True, QUIET=True)
                s = pdbp.get_structure("struct",
                                       StringIO(sc.structure.pdb_data.pdb))
                chain = s[0][sc.alpha]
                nums = []
                for res in chain:
                    if "CA" in res and res.id[0] == " ":
                        nums.append(res.get_id()[1])

                resis = Residue.objects.filter(
                    protein_conformation__protein=sc.protein)
                # NOTE(review): num_i and temp_seq2 are assigned but not read
                # again inside this function.
                num_i = 0
                temp_seq2 = ""
                # pdb_num_dict maps PDB residue number -> [PDB residue, mapped
                # reference Residue]; it is rewritten by the fixes below.
                pdb_num_dict = OrderedDict()
                # Create first alignment based on sequence numbers
                for n in nums:
                    if sc.structure.pdb_code.index == "6OIJ" and n < 30:
                        nr = n + 6
                    else:
                        nr = n
                    pdb_num_dict[n] = [
                        chain[n], resis.get(sequence_number=nr)
                    ]
                # Find mismatches
                mismatches = []
                for n, res in pdb_num_dict.items():
                    if AA[res[0].get_resname()] != res[1].amino_acid:
                        mismatches.append(res)

                pdb_lines = sc.structure.pdb_data.pdb.split("\n")
                seqadv = []
                for l in pdb_lines:
                    if l.startswith("SEQADV"):
                        seqadv.append(l)
                # mutations: PDB number -> [PDB resname, reference resname]
                # shifted_mutations additionally stores the reference number.
                mutations, shifted_mutations = OrderedDict(), OrderedDict()
                # Search for annotated engineered mutations in pdb SEQADV
                # NOTE(review): the loop variable `s` rebinds the name used
                # for the parsed structure above; `chain` was taken from it
                # earlier and `s` is not used as a structure afterwards.
                for s in seqadv:
                    line_search = re.search(
                        "SEQADV\s{1}[A-Z\s\d]{4}\s{1}([A-Z]{3})\s{1}([A-Z]{1})\s+(\d+)[\s\S\d]{5}([\s\S\d]{12})([A-Z]{3})\s+(\d+)(\s\S+)",
                        s)
                    if line_search != None:
                        if line_search.group(2) == sc.alpha:
                            if line_search.group(
                                    4).strip() == sc.protein.accession:
                                if line_search.group(
                                        3) == line_search.group(6):
                                    mutations[int(
                                        line_search.group(3))] = [
                                            line_search.group(1),
                                            line_search.group(5)
                                        ]
                                else:
                                    shifted_mutations[int(
                                        line_search.group(3))] = [
                                            line_search.group(1),
                                            line_search.group(5),
                                            int(line_search.group(6))
                                        ]
                            else:
                                # Exception for 6G79
                                if line_search.group(
                                        3
                                ) != line_search.group(
                                        6
                                ) and "CONFLICT" in line_search.group(7):
                                    mutations[int(
                                        line_search.group(3))] = [
                                            line_search.group(1),
                                            line_search.group(5)
                                        ]
                                # Exception for 5G53
                                if line_search.group(
                                        4).strip() != sc.protein.accession:
                                    mutations[int(
                                        line_search.group(3))] = [
                                            line_search.group(1),
                                            line_search.group(5)
                                        ]
                remaining_mismatches = []

                # Check and clear mismatches that are registered in pdb SEQADV as engineered mutation
                for m in mismatches:
                    num = m[0].get_id()[1]
                    if num in mutations:
                        if m[0].get_resname() != mutations[num][0] and m[
                                1].amino_acid != AA[mutations[num][1]]:
                            remaining_mismatches.append(m)
                    elif num in shifted_mutations:
                        remaining_mismatches.append(m)
                    else:
                        remaining_mismatches.append(m)

                if options["debug"]:
                    print(sc)
                    print(mutations)
                    print(shifted_mutations)
                    print(mismatches)
                    print("======")
                    print(remaining_mismatches)
                    pprint.pprint(pdb_num_dict)

                # PDB codes for which the pairwise-alignment shift correction
                # below is skipped; the HN check can append to this list.
                no_seqnum_shift = [
                    '6OY9', '6OYA', '6LPB', '6WHA', '7D77', '6XOX', '7L1U',
                    '7L1V'
                ]

                # Check if HN is mutated to GNAI1 for the scFv16 stabilizer
                if sc.protein.entry_name != 'gnai1_human' and len(
                        remaining_mismatches) > 0:
                    target_HN = resis.filter(protein_segment__slug='HN')
                    gnai1_HN = Residue.objects.filter(
                        protein_conformation__protein__entry_name=
                        'gnai1_human',
                        protein_segment__slug='HN')
                    pdb_HN_seq = ''
                    for num, val in pdb_num_dict.items():
                        if num <= target_HN.reverse()[0].sequence_number:
                            pdb_HN_seq += Polypeptide.three_to_one(
                                val[0].get_resname())
                    if options['debug']:
                        print('Checking if HN is gnai1_human')
                        print(pdb_HN_seq)
                        print(''.join(
                            gnai1_HN.values_list('amino_acid', flat=True)))
                    gnai1_HN_seq = ''.join(
                        gnai1_HN.values_list('amino_acid', flat=True))
                    pw2 = pairwise2.align.localms(gnai1_HN_seq, pdb_HN_seq,
                                                  3, -4, -3, -1)
                    ref_seq, temp_seq = str(pw2[0][0]), str(pw2[0][1])
                    # Percent identity over the aligned columns where the PDB
                    # sequence has no gap.
                    length, match = 0, 0
                    for r, t in zip(ref_seq, temp_seq):
                        if options['debug']:
                            print(r, t)
                        if t != '-':
                            if r == t:
                                match += 1
                            length += 1
                    # NOTE(review): raises ZeroDivisionError when length is 0;
                    # that is caught by the outer except for this complex.
                    identity = match / length * 100
                    if options['debug']:
                        print(identity)
                    if identity > 85:
                        if sc.structure.pdb_code.index not in ['7DFL']:
                            no_seqnum_shift.append(
                                sc.structure.pdb_code.index)
                        if options['debug']:
                            print(
                                'INFO: HN has {}% with gnai1_human HN, skipping seqnum shift correction'
                                .format(round(identity)))

                # Mismatches remained possibly to seqnumber shift, making pairwise alignment to try and fix alignment
                if len(
                        remaining_mismatches
                ) > 0 and sc.structure.pdb_code.index not in no_seqnum_shift:
                    ppb = PPBuilder()
                    seq = ""
                    for pp in ppb.build_peptides(chain, aa_only=False):
                        seq += str(pp.get_sequence())
                    if sc.structure.pdb_code.index in [
                            '7JVQ', '7L1U', '7L1V'
                    ]:
                        pw2 = pairwise2.align.localms(
                            sc.protein.sequence, seq, 3, -4, -3, -1)
                    else:
                        pw2 = pairwise2.align.localms(
                            sc.protein.sequence, seq, 2, -1, -.5, -.1)
                    ref_seq, temp_seq = str(pw2[0][0]), str(pw2[0][1])

                    # Custom fix for A->G mutation at pos 18
                    if sc.structure.pdb_code.index == '7JJO':
                        ref_seq = ref_seq[:18] + ref_seq[19:]
                        temp_seq = temp_seq[:17] + temp_seq[18:]
                    # Custom alignment fixes
                    elif sc.structure.pdb_code.index == '7DFL':
                        ref_seq = 'MTLESIMACCLSEEAKEARRINDEIERQLRRDKRDARRELKLLLLGTGESGKSTFIKQMRIIHGSGYSDEDKRGFTKLVYQNIFTAMQAMIRAMDTLKIPYKYEHNKAHAQLVREVDVEKVSAFENPYVDAIKSLWNDPGIQECYDRRREYQLSDSTKYYLNDLDRVADPAYLPTQQDVLRVRVPTTGIIEYPFDLQSVIFRMVDVGGQRSERRKWIHCFENVTSIMFLVALSEYDQVLVESDNENRMEESKALFRTIITYPWFQNSSVILFLNKKDLLEEKIMYSHLVDYFPEYDGPQRDAQAAREFILKMFVDLNPDSDKIIYSHFTCATDTENIRFVFAAVKDTILQLNLKEYNLV'
                        temp_seq = '--------CTLSAEDKAAVERSKMIDRNLREDGEKARRELKLLLLGTGESGKSTFIKQMRIIHG--------------------------------------------------------------------------------------------------------------------------TGIIEYPFDLQSVIFRMVDVGGQRSERRKWIHCFENVTSIMFLVALSEYDQV----DNENRMEESKALFRTIITYPWFQNSSVILFLNKKDLLEEKIMYSHLVDYFPEYDGPQRDAQAAREFILKMFVDLNPDSDKILYSHFTCATDTENIRFVFAAVKDTILQLNLKEYNLV'
                    elif sc.structure.pdb_code.index == '7JOZ':
                        temp_seq = temp_seq[:67] + (
                            '-' * 14) + 'FNGDS' + temp_seq[86:]
                    elif sc.structure.pdb_code.index == '7AUE':
                        ref_seq = ref_seq[:31].replace('-',
                                                       '') + ref_seq[31:]
                        temp_seq = (
                            9 * '-') + temp_seq[2:5] + temp_seq[5:54].replace(
                                '-', '') + temp_seq[54:]
                    # Walk the alignment column by column: j indexes the
                    # reference residues, k indexes the PDB residue numbers.
                    # Gap columns are recorded with the integer column index
                    # in place of the missing partner.
                    wt_pdb_dict = OrderedDict()
                    pdb_wt_dict = OrderedDict()
                    j, k = 0, 0
                    for i, ref, temp in zip(range(0, len(ref_seq)), ref_seq,
                                            temp_seq):
                        if options["debug"]:
                            print(i, ref, temp)  # alignment check
                        if ref != "-" and temp != "-":
                            wt_pdb_dict[resis[j]] = pdb_num_dict[nums[k]]
                            pdb_wt_dict[pdb_num_dict[nums[k]]
                                        [0]] = resis[j]
                            j += 1
                            k += 1
                        elif ref == "-":
                            wt_pdb_dict[i] = pdb_num_dict[nums[k]]
                            pdb_wt_dict[pdb_num_dict[nums[k]][0]] = i
                            k += 1
                        elif temp == "-":
                            wt_pdb_dict[resis[j]] = i
                            pdb_wt_dict[i] = resis[j]
                            j += 1
                    # Custom fix for 7JJO isoform difference
                    if sc.structure.pdb_code.index in [
                            '7JJO', '7JOZ', '7AUE'
                    ]:
                        # Rebuild the mapping keyed by reference sequence
                        # number, keeping only columns aligned to a PDB
                        # residue (list values).
                        pdb_num_dict = OrderedDict()
                        for wt_res, st_res in wt_pdb_dict.items():
                            if type(st_res) == type([]):
                                pdb_num_dict[wt_res.sequence_number] = [
                                    st_res[0], wt_res
                                ]
                    else:
                        for i, r in enumerate(remaining_mismatches):
                            # Adjust for shifted residue when residue is a match
                            # NOTE(review): for i == 0 the index i - 1 is -1,
                            # i.e. the last remaining mismatch - confirm this
                            # is intended.
                            if r[0].get_id()[1] - remaining_mismatches[
                                    i - 1][0].get_id()[1] > 1:
                                pdb_num_dict[r[0].get_id()[1] -
                                             1][1] = pdb_wt_dict[chain[
                                                 r[0].get_id()[1] - 1]]
                            # Adjust for shifted residue when residue is mutated and it's logged in SEQADV
                            if r[0].get_id()[1] in shifted_mutations:
                                pdb_num_dict[
                                    r[0].get_id()[1]][1] = resis.get(
                                        sequence_number=shifted_mutations[
                                            r[0].get_id()[1]][2])
                            # Adjust for shift
                            else:
                                pdb_num_dict[r[0].get_id()
                                             [1]][1] = pdb_wt_dict[r[0]]
                    # Hard-coded per-structure overrides of single positions.
                    if sc.structure.pdb_code.index == '7JVQ':
                        pdb_num_dict[198][1] = Residue.objects.get(
                            protein_conformation__protein=sc.protein,
                            sequence_number=346)
                        pdb_num_dict[235][1] = Residue.objects.get(
                            protein_conformation__protein=sc.protein,
                            sequence_number=383)
                    elif sc.structure.pdb_code.index == '6PB0':
                        pdb_num_dict[205][1] = Residue.objects.get(
                            protein_conformation__protein=sc.protein,
                            sequence_number=205)
                ### Custom alignment fix for 6WHA mini-Gq/Gi2/Gs chimera
                elif sc.structure.pdb_code.index == "6WHA":
                    ref_seq = "MTLESIMACCLSEEAKEARRINDEIERQLRRDKRDARRELKLLLLGTGESGKSTFIKQMRIIHGSGYSDEDKRGFTKLVYQNIFTAMQAMIRAMDTLKIPYKYEHNKAHAQLVREVDVEKVSAFENPYVDAIKSLWNDPGIQECYDRRREYQLSDSTKYYLNDLDRVADPAYLPTQQDVLRVRVPTTGIIEYPFDLQSVIFRMVDVGGQRSERRKWIHCFENVTSIMFLVALSEYDQVLVESDNENRMEESKALFRTIITYPWFQNSSVILFLNKKDLLEEKIM--YSHLVDYFPEYDGP----QRDAQAAREFILKMFVDL---NPDSDKIIYSHFTCATDTENIRFVFAAVKDTILQLNLKEYNLV"
                    temp_seq = "----------VSAEDKAAAERSKMIDKNLREDGEKARRTLRLLLLGADNSGKSTIVK----------------------------------------------------------------------------------------------------------------------------------GIFETKFQVDKVNFHMFDVG-----RRKWIQCFNDVTAIIFVVDSSDYNR----------LQEALNDFKSIWNNRWLRTISVILFLNKQDLLAEKVLAGKSKIEDYFPEFARYTTPDPRVTRAKY-FIRKEFVDISTASGDGRHICYPHFTC-VDTENARRIFNDCKDIILQMNLREYNLV"
                    pdb_num_dict = OrderedDict()
                    # NOTE(review): temp_resis is not read again inside this
                    # function.
                    temp_resis = [res for res in chain]
                    temp_i = 0
                    mapped_cgns = []
                    for i, aa in enumerate(temp_seq):
                        if aa != "-":
                            # Reference sequence number = 1-based column
                            # position minus the gaps seen in ref_seq so far.
                            ref_split_on_gaps = ref_seq[:i + 1].split("-")
                            ref_seqnum = i - (len(ref_split_on_gaps) -
                                              1) + 1
                            res = resis.get(sequence_number=ref_seqnum)
                            # Each generic-number label is used once; on a
                            # clash step forward with get_next_presumed_cgn
                            # until an unused label (or a falsy result).
                            if res.display_generic_number.label in mapped_cgns:
                                next_presumed_cgn = self.get_next_presumed_cgn(
                                    res)
                                if next_presumed_cgn:
                                    res = next_presumed_cgn
                                    while res and res.display_generic_number.label in mapped_cgns:
                                        res = self.get_next_presumed_cgn(
                                            res)
                                else:
                                    print(
                                        "Error: {} CGN does not exist. Incorrect mapping of {} in {}"
                                        .format(next_presumed_cgn,
                                                chain[nums[temp_i]],
                                                sc.structure))
                            mapped_cgns.append(
                                res.display_generic_number.label)
                            pdb_num_dict[nums[temp_i]] = [
                                chain[nums[temp_i]], res
                            ]
                            temp_i += 1

                # Persist one Residue per mapped position and collect its
                # rotamer; integer placeholders (alignment gaps) are skipped.
                bulked_rotamers = []
                for key, val in pdb_num_dict.items():
                    # print(key, val) # sanity check
                    if not isinstance(val[1], int):
                        res_obj = Residue()
                        res_obj.sequence_number = val[0].get_id()[1]
                        res_obj.amino_acid = AA[val[0].get_resname()]
                        res_obj.display_generic_number = val[
                            1].display_generic_number
                        res_obj.generic_number = val[1].generic_number
                        res_obj.protein_conformation = alpha_protconf
                        res_obj.protein_segment = val[1].protein_segment
                        res_obj.save()
                        rot = self.create_structure_rotamer(
                            val[0], res_obj, sc.structure)
                        bulked_rotamers.append(rot)
                    else:
                        self.logger.info(
                            "Skipped {} as no annotation was present, while building for alpha subunit of {}"
                            .format(val[1], sc))
                if options["debug"]:
                    pprint.pprint(pdb_num_dict)
                Rotamer.objects.bulk_create(bulked_rotamers)
                self.logger.info(
                    "Protein, ProteinConformation and Residue build for alpha subunit of {} is finished"
                    .format(sc))
            except Exception as msg:
                # The exception text is only printed in debug mode; the log
                # entry (info level) does not include it.
                if options["debug"]:
                    print("Error: ", sc, msg)
                self.logger.info(
                    "Protein, ProteinConformation and Residue build for alpha subunit of {} has failed"
                    .format(sc))

    if not options["s"]:
        ### Build SignprotStructure objects from non-complex signprots
        g_prot_alphas = Protein.objects.filter(
            family__slug__startswith="100_001",
            accession__isnull=False)  #.filter(entry_name="gnai1_human")
        complex_structures = SignprotComplex.objects.all().values_list(
            "structure__pdb_code__index", flat=True)
        for a in g_prot_alphas:
            pdb_list = get_pdb_ids(a.accession)
            for pdb in pdb_list:
                if pdb not in complex_structures:
                    try:
                        data = self.fetch_gprot_data(pdb, a)
                        if data:
                            self.build_g_prot_struct(a, pdb, data)
                    except Exception as msg:
                        self.logger.error(
                            "SignprotStructure of {} {} failed\n{}: {}".
                            format(a.entry_name, pdb, type(msg), msg))

    if options["debug"]:
        print(datetime.datetime.now() - startTime)
def main_func(self, positions, iteration):
    """Build construct proteins, conformations and residues from YAML files.

    Walks the slice of ``self.filenames`` selected by ``positions``. For every
    regular, non-hidden file in ``self.construct_data_dir`` the YAML content is
    loaded and a construct ``Protein`` (child of the parent protein named in
    the file) plus a ``ProteinConformation`` in
    ``settings.DEFAULT_PROTEIN_STATE`` are created. The parent's residues are
    then copied to the construct, applying the file's ``deletions``,
    ``mutations`` and ``insertions`` (fusion proteins), and the construct's
    sequence is updated to the resulting residue string.

    Args:
        positions: Indexable pair ``(start, stop)`` into ``self.filenames``;
            a falsy ``stop`` means "until the end of the list".
        iteration: Not used by this function.
    """
    # filenames
    if not positions[1]:
        filenames = self.filenames[positions[0]:]
    else:
        filenames = self.filenames[positions[0]:positions[1]]

    # parse files
    for source_file in filenames:
        source_file_path = os.sep.join([self.construct_data_dir, source_file])

        # only regular, non-hidden files are treated as construct definitions
        if not os.path.isfile(source_file_path) or source_file[0] == '.':
            continue

        self.logger.info('Reading file {}'.format(source_file_path))

        # read the yaml file; PyYAML >= 6 requires an explicit Loader
        with open(source_file_path, 'r') as f:
            sd = yaml.load(f, Loader=yaml.FullLoader)

        # is a protein specified? (an empty file loads as None, so check the
        # type before testing for the key)
        if not isinstance(sd, dict) or 'protein' not in sd:
            self.logger.error('Protein not specified for construct, skipping')
            continue

        # fetch the parent protein
        try:
            ppc = ProteinConformation.objects.prefetch_related(
                'protein__family', 'protein__species',
                'protein__residue_numbering_scheme').get(
                    protein__entry_name=sd['protein'],
                    state__slug=settings.DEFAULT_PROTEIN_STATE)
        except ProteinConformation.DoesNotExist:
            # abort if parent protein is not found
            self.logger.error(
                'Parent protein {} for construct {} not found, aborting!'.format(
                    sd['protein'], sd['name']))
            continue

        # sequence type
        try:
            sequence_type, created = ProteinSequenceType.objects.get_or_create(
                slug='mod', defaults={'name': 'Modified'})
            if created:
                self.logger.info('Created sequence type {}'.format(sequence_type))
        except IntegrityError:
            # another writer created the row first; fetch it instead
            sequence_type = ProteinSequenceType.objects.get(slug='mod')

        # protein source
        try:
            protein_source, created = ProteinSource.objects.get_or_create(name='OTHER')
            if created:
                self.logger.info('Created protein source {}'.format(protein_source))
        except IntegrityError:
            protein_source = ProteinSource.objects.get(name='OTHER')

        # create a protein record
        p = Protein()
        p.parent = ppc.protein
        p.family = ppc.protein.family
        p.species = ppc.protein.species
        p.residue_numbering_scheme = ppc.protein.residue_numbering_scheme
        p.sequence_type = sequence_type
        p.source = protein_source
        p.entry_name = slugify(strip_tags(sd['name']))
        p.name = sd['name']
        p.sequence = ppc.protein.sequence

        # save protein (construct)
        try:
            p.save()
            self.logger.info('Created construct {} with parent protein {}'.format(
                p.name, ppc.protein.entry_name))
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit still propagate
            self.logger.error(
                'Failed creating construct {} with parent protein {}'.format(
                    p.name, ppc.protein.entry_name))
            continue

        # create protein conformation record
        pc = ProteinConformation()
        pc.protein = p
        pc.state = ProteinState.objects.get(slug=settings.DEFAULT_PROTEIN_STATE)
        try:
            pc.save()
            self.logger.info('Created conformation {} of protein {}'.format(
                pc.state.name, p.name))
        except Exception:
            self.logger.error('Failed creating conformation {} of protein {}'.format(
                pc.state.name, p.entry_name))
            # residues cannot be attached to an unsaved conformation
            continue

        # create residue records
        # positions removed from the construct; a set gives O(1) membership
        # tests in the residue loop below
        deletions = set()
        for t in sd.get('deletions') or []:
            deletions.update(range(t[0], t[1] + 1))

        # mutations are written like "A123C": wild type, position, mutant
        mutations = {}
        for m in sd.get('mutations') or []:
            mutations[int(m[1:-1])] = {
                'wt_res': m[0],
                'mut_res': m[-1],
                'full': m,
            }

        # insertions
        split_segments = {}
        for ins in sd.get('insertions') or []:
            first_pos, last_pos = ins['positions'][0], ins['positions'][1]
            # .get() raises if a residue is missing, so both are real objects
            ins_start = Residue.objects.get(
                protein_conformation=ppc, sequence_number=first_pos)
            ins_end = Residue.objects.get(
                protein_conformation=ppc, sequence_number=last_pos)

            # if the insertion is within only one segment (the usual case),
            # split that segment into two segments
            if ins_start.protein_segment == ins_end.protein_segment:
                parent_segment = ins_start.protein_segment
                split_defaults = {
                    'name': parent_segment.name,
                    'category': parent_segment.category,
                    'partial': True,
                }

                # get/create split protein segments
                slug_1 = parent_segment.slug + "_1"
                try:
                    segment_before, created = ProteinSegment.objects.get_or_create(
                        slug=slug_1, defaults=split_defaults)
                    if created:
                        self.logger.info(
                            'Created protein segment {}'.format(segment_before))
                except IntegrityError:
                    segment_before = ProteinSegment.objects.get(slug=slug_1)

                slug_2 = parent_segment.slug + "_2"
                try:
                    segment_after, created = ProteinSegment.objects.get_or_create(
                        slug=slug_2, defaults=split_defaults)
                    if created:
                        self.logger.info(
                            'Created protein segment {}'.format(segment_after))
                except IntegrityError:
                    segment_after = ProteinSegment.objects.get(slug=slug_2)

                # keep track of information about split segments
                split_segments[parent_segment.slug] = {
                    'start': {
                        'sequence_number': first_pos,
                        'segment': segment_before,
                    },
                    'end': {
                        'sequence_number': last_pos,
                        'segment': segment_after,
                    },
                }
            # if the insertion covers two segments, use those two as the
            # segments before and after
            else:
                segment_before = ins_start.protein_segment
                segment_after = ins_end.protein_segment

            # if the insertion replaces a part of the sequence, add that range
            # as a deletion
            if last_pos > (first_pos + 1):
                deletions.update(range(first_pos + 1, last_pos))

            # get/insert fusion protein
            fusion, _ = ProteinFusion.objects.get_or_create(
                name=ins['name'], defaults={'sequence': ins['sequence']})

            # create relationship with protein
            ProteinFusionProtein.objects.create(
                protein=p, protein_fusion=fusion,
                segment_before=segment_before, segment_after=segment_after)

        prs = Residue.objects.filter(protein_conformation=ppc).prefetch_related(
            'protein_conformation__protein', 'protein_segment', 'generic_number',
            'display_generic_number__scheme', 'alternative_generic_numbers__scheme')

        sequence_parts = []
        for pr in prs:
            if pr.sequence_number in deletions:
                continue

            r = Residue()
            r.protein_conformation = pc
            r.generic_number = pr.generic_number
            r.display_generic_number = pr.display_generic_number
            r.sequence_number = pr.sequence_number

            # check for split segments
            if pr.protein_segment.slug in split_segments:
                split = split_segments[pr.protein_segment.slug]
                if r.sequence_number <= split['start']['sequence_number']:
                    r.protein_segment = split['start']['segment']
                elif r.sequence_number >= split['end']['sequence_number']:
                    r.protein_segment = split['end']['segment']
            else:
                r.protein_segment = pr.protein_segment

            # amino acid, check for mutations
            if r.sequence_number in mutations:
                mutation = mutations[r.sequence_number]
                if mutation['wt_res'] == pr.amino_acid:
                    r.amino_acid = mutation['mut_res']
                else:
                    # format the whole message (previously only the trailing
                    # ' of {}' fragment was formatted)
                    # NOTE(review): r.amino_acid keeps the model default in
                    # this branch - confirm whether the wild-type residue
                    # should be used instead.
                    self.logger.error(
                        'Mutation {} in construct {} does not match wild-type sequence'
                        ' of {}'.format(mutation['full'], pc.protein.name,
                                        ppc.protein.entry_name))
            else:
                r.amino_acid = pr.amino_acid

            # save amino acid to updated sequence
            sequence_parts.append(r.amino_acid)

            # save residue before populating M2M relations
            r.save()

            # alternative generic numbers
            for agn in pr.alternative_generic_numbers.all():
                r.alternative_generic_numbers.add(agn)

        # update sequence
        p.sequence = ''.join(sequence_parts)
        p.save()
def main_func(self, positions, iteration):
    """Build construct proteins, conformations and residues from YAML files.

    Walks the slice of ``self.filenames`` selected by ``positions``. For every
    regular, non-hidden file in ``self.construct_data_dir`` the YAML content is
    loaded and a construct ``Protein`` (child of the parent protein named in
    the file) plus a ``ProteinConformation`` in
    ``settings.DEFAULT_PROTEIN_STATE`` are created. The parent's residues are
    then copied to the construct, applying the file's ``deletions``,
    ``mutations`` and ``insertions`` (fusion proteins), and the construct's
    sequence is updated to the resulting residue string.

    Args:
        positions: Indexable pair ``(start, stop)`` into ``self.filenames``;
            a falsy ``stop`` means "until the end of the list".
        iteration: Not used by this function.
    """
    # filenames
    if not positions[1]:
        filenames = self.filenames[positions[0]:]
    else:
        filenames = self.filenames[positions[0]:positions[1]]

    # parse files
    for source_file in filenames:
        source_file_path = os.sep.join(
            [self.construct_data_dir, source_file])

        # only regular, non-hidden files are treated as construct definitions
        if not os.path.isfile(source_file_path) or source_file[0] == '.':
            continue

        self.logger.info('Reading file {}'.format(source_file_path))

        # read the yaml file; PyYAML >= 6 requires an explicit Loader
        with open(source_file_path, 'r') as f:
            sd = yaml.load(f, Loader=yaml.FullLoader)

        # is a protein specified? (an empty file loads as None, so check the
        # type before testing for the key)
        if not isinstance(sd, dict) or 'protein' not in sd:
            self.logger.error(
                'Protein not specified for construct, skipping')
            continue

        # fetch the parent protein
        try:
            ppc = ProteinConformation.objects.prefetch_related(
                'protein__family', 'protein__species',
                'protein__residue_numbering_scheme').get(
                    protein__entry_name=sd['protein'],
                    state__slug=settings.DEFAULT_PROTEIN_STATE)
        except ProteinConformation.DoesNotExist:
            # abort if parent protein is not found
            self.logger.error(
                'Parent protein {} for construct {} not found, aborting!'
                .format(sd['protein'], sd['name']))
            continue

        # sequence type
        try:
            sequence_type, created = ProteinSequenceType.objects.get_or_create(
                slug='mod', defaults={'name': 'Modified'})
            if created:
                self.logger.info(
                    'Created sequence type {}'.format(sequence_type))
        except IntegrityError:
            # another writer created the row first; fetch it instead
            sequence_type = ProteinSequenceType.objects.get(slug='mod')

        # protein source
        try:
            protein_source, created = ProteinSource.objects.get_or_create(
                name='OTHER')
            if created:
                self.logger.info(
                    'Created protein source {}'.format(protein_source))
        except IntegrityError:
            protein_source = ProteinSource.objects.get(name='OTHER')

        # create a protein record
        p = Protein()
        p.parent = ppc.protein
        p.family = ppc.protein.family
        p.species = ppc.protein.species
        p.residue_numbering_scheme = ppc.protein.residue_numbering_scheme
        p.sequence_type = sequence_type
        p.source = protein_source
        p.entry_name = slugify(strip_tags(sd['name']))
        p.name = sd['name']
        p.sequence = ppc.protein.sequence

        # save protein (construct)
        try:
            p.save()
            self.logger.info(
                'Created construct {} with parent protein {}'.format(
                    p.name, ppc.protein.entry_name))
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit still propagate
            self.logger.error(
                'Failed creating construct {} with parent protein {}'
                .format(p.name, ppc.protein.entry_name))
            continue

        # create protein conformation record
        pc = ProteinConformation()
        pc.protein = p
        pc.state = ProteinState.objects.get(
            slug=settings.DEFAULT_PROTEIN_STATE)
        try:
            pc.save()
            self.logger.info(
                'Created conformation {} of protein {}'.format(
                    pc.state.name, p.name))
        except Exception:
            self.logger.error(
                'Failed creating conformation {} of protein {}'.format(
                    pc.state.name, p.entry_name))
            # residues cannot be attached to an unsaved conformation
            continue

        # create residue records
        # positions removed from the construct; a set gives O(1) membership
        # tests in the residue loop below
        deletions = set()
        for t in sd.get('deletions') or []:
            deletions.update(range(t[0], t[1] + 1))

        # mutations are written like "A123C": wild type, position, mutant
        mutations = {}
        for m in sd.get('mutations') or []:
            mutations[int(m[1:-1])] = {
                'wt_res': m[0],
                'mut_res': m[-1],
                'full': m,
            }

        # insertions
        split_segments = {}
        for ins in sd.get('insertions') or []:
            first_pos, last_pos = ins['positions'][0], ins['positions'][1]
            # .get() raises if a residue is missing, so both are real objects
            ins_start = Residue.objects.get(
                protein_conformation=ppc, sequence_number=first_pos)
            ins_end = Residue.objects.get(
                protein_conformation=ppc, sequence_number=last_pos)

            # if the insertion is within only one segment (the usual case),
            # split that segment into two segments
            if ins_start.protein_segment == ins_end.protein_segment:
                parent_segment = ins_start.protein_segment
                split_defaults = {
                    'name': parent_segment.name,
                    'category': parent_segment.category,
                    'partial': True,
                }

                # get/create split protein segments
                slug_1 = parent_segment.slug + "_1"
                try:
                    segment_before, created = ProteinSegment.objects.get_or_create(
                        slug=slug_1, defaults=split_defaults)
                    if created:
                        self.logger.info(
                            'Created protein segment {}'.format(
                                segment_before))
                except IntegrityError:
                    segment_before = ProteinSegment.objects.get(slug=slug_1)

                slug_2 = parent_segment.slug + "_2"
                try:
                    segment_after, created = ProteinSegment.objects.get_or_create(
                        slug=slug_2, defaults=split_defaults)
                    if created:
                        self.logger.info(
                            'Created protein segment {}'.format(
                                segment_after))
                except IntegrityError:
                    segment_after = ProteinSegment.objects.get(slug=slug_2)

                # keep track of information about split segments
                split_segments[parent_segment.slug] = {
                    'start': {
                        'sequence_number': first_pos,
                        'segment': segment_before,
                    },
                    'end': {
                        'sequence_number': last_pos,
                        'segment': segment_after,
                    },
                }
            # if the insertion covers two segments, use those two as the
            # segments before and after
            else:
                segment_before = ins_start.protein_segment
                segment_after = ins_end.protein_segment

            # if the insertion replaces a part of the sequence, add that range
            # as a deletion
            if last_pos > (first_pos + 1):
                deletions.update(range(first_pos + 1, last_pos))

            # get/insert fusion protein
            fusion, _ = ProteinFusion.objects.get_or_create(
                name=ins['name'], defaults={'sequence': ins['sequence']})

            # create relationship with protein
            ProteinFusionProtein.objects.create(
                protein=p,
                protein_fusion=fusion,
                segment_before=segment_before,
                segment_after=segment_after)

        prs = Residue.objects.filter(
            protein_conformation=ppc).prefetch_related(
                'protein_conformation__protein', 'protein_segment',
                'generic_number', 'display_generic_number__scheme',
                'alternative_generic_numbers__scheme')

        sequence_parts = []
        for pr in prs:
            if pr.sequence_number in deletions:
                continue

            r = Residue()
            r.protein_conformation = pc
            r.generic_number = pr.generic_number
            r.display_generic_number = pr.display_generic_number
            r.sequence_number = pr.sequence_number

            # check for split segments
            if pr.protein_segment.slug in split_segments:
                split = split_segments[pr.protein_segment.slug]
                if r.sequence_number <= split['start']['sequence_number']:
                    r.protein_segment = split['start']['segment']
                elif r.sequence_number >= split['end']['sequence_number']:
                    r.protein_segment = split['end']['segment']
            else:
                r.protein_segment = pr.protein_segment

            # amino acid, check for mutations
            if r.sequence_number in mutations:
                mutation = mutations[r.sequence_number]
                if mutation['wt_res'] == pr.amino_acid:
                    r.amino_acid = mutation['mut_res']
                else:
                    # format the whole message (previously only the trailing
                    # ' of {}' fragment was formatted)
                    # NOTE(review): r.amino_acid keeps the model default in
                    # this branch - confirm whether the wild-type residue
                    # should be used instead.
                    self.logger.error(
                        'Mutation {} in construct {} does not match wild-type sequence'
                        ' of {}'.format(mutation['full'], pc.protein.name,
                                        ppc.protein.entry_name))
            else:
                r.amino_acid = pr.amino_acid

            # save amino acid to updated sequence
            sequence_parts.append(r.amino_acid)

            # save residue before populating M2M relations
            r.save()

            # alternative generic numbers
            for agn in pr.alternative_generic_numbers.all():
                r.alternative_generic_numbers.add(agn)

        # update sequence
        p.sequence = ''.join(sequence_parts)
        p.save()