def _get_cached_generic_number(schemes, scheme_slug, label, defaults, scheme_obj=None):
    """Return the ResidueGenericNumber labelled ``label`` in scheme ``scheme_slug``.

    ``schemes[scheme_slug]['generic_numbers']`` is used as a cache: a hit is returned directly, a
    miss is resolved with ``get_or_create`` and then stored in the cache.

    ``scheme_obj`` may be passed when the caller already holds the ResidueNumberingScheme;
    otherwise it is looked up by slug, and only on a cache miss.
    """
    cache = schemes[scheme_slug]['generic_numbers']
    if label in cache:
        return cache[label]

    if scheme_obj is None:
        scheme_obj = ResidueNumberingScheme.objects.get(slug=scheme_slug)
    try:
        generic_number, _ = ResidueGenericNumber.objects.get_or_create(
            scheme=scheme_obj, label=label, defaults=defaults)
    except IntegrityError:
        # get_or_create lost out to an existing row (presumably a concurrent insert); fetch it
        generic_number = ResidueGenericNumber.objects.get(scheme=scheme_obj, label=label)

    cache[label] = generic_number
    return generic_number


def create_or_update_residue(protein_conformation, segment, schemes, residue, b_and_c):
    """Build an unsaved Residue plus the list of its alternative generic numbers.

    Args:
        protein_conformation: conformation the residue belongs to; its
            ``protein.residue_numbering_scheme`` is used for the display number and the equivalent.
        segment: protein segment assigned to the residue and used as the default segment for any
            generic number that has to be created.
        schemes: dict keyed by scheme slug; each entry's ``'generic_numbers'`` dict is used (and
            updated) as a label -> ResidueGenericNumber cache.
        residue: dict with keys ``'aa'``, ``'pos'`` and ``'numbers'``.
        b_and_c: passed through unchanged to ``format_generic_numbers``.

    Returns:
        ``[residue_object, alternative_generic_numbers]``. The Residue is NOT saved here and the
        alternative numbers are not attached to it; the caller is expected to do both.
    """
    # generic numbers created here get the residue's segment as their default segment
    rns_defaults = {'protein_segment': segment}
    default_scheme_slug = settings.DEFAULT_NUMBERING_SCHEME

    amino_acid = residue['aa']
    sequence_number = residue['pos']
    numbers = residue['numbers']
    if 'generic_number' in numbers:
        numbers = format_generic_numbers(
            protein_conformation.protein.residue_numbering_scheme, schemes, sequence_number,
            numbers['generic_number'], numbers['bw'], b_and_c)

    generic_number = None
    display_generic_number = None

    # main generic number (default numbering scheme)
    if 'generic_number' in numbers:
        generic_number = _get_cached_generic_number(
            schemes, default_scheme_slug, numbers['generic_number'], rns_defaults)

    # equivalent to main generic number in the protein's own numbering scheme
    if 'equivalent' in numbers:
        protein_scheme = protein_conformation.protein.residue_numbering_scheme
        try:
            ResidueGenericNumberEquivalent.objects.get_or_create(
                default_generic_number=generic_number,
                scheme=protein_scheme,
                defaults={'label': numbers['equivalent']})
        except IntegrityError:
            # same fallback as for generic numbers: the row must already exist
            ResidueGenericNumberEquivalent.objects.get(
                default_generic_number=generic_number,
                scheme=protein_scheme)

    # display generic number (protein's own numbering scheme)
    if 'display_generic_number' in numbers:
        protein_scheme = protein_conformation.protein.residue_numbering_scheme
        display_generic_number = _get_cached_generic_number(
            schemes, protein_scheme.slug, numbers['display_generic_number'], rns_defaults,
            scheme_obj=protein_scheme)

    # the residue is only instantiated; saving is left to the caller (bulk insert)
    bulk_r = Residue(protein_conformation=protein_conformation,
                     sequence_number=sequence_number,
                     amino_acid=amino_acid,
                     display_generic_number=display_generic_number,
                     generic_number=generic_number,
                     protein_segment=segment)

    # alternative generic numbers, one per alternative scheme
    bulk_add_alt = []
    if numbers and 'alternative_generic_numbers' in numbers:
        for alt_scheme, alt_num in numbers['alternative_generic_numbers'].items():
            bulk_add_alt.append(
                _get_cached_generic_number(schemes, alt_scheme, alt_num, rns_defaults))

    return [bulk_r, bulk_add_alt]
def handle(self, *args, **options):
    """Build Protein, ProteinConformation and Residue rows for the alpha subunit of each complex.

    For every SignprotComplex the alpha chain is parsed from the stored PDB text. Each chain
    residue that has a CA atom is paired with the residue of the reference protein
    (``sc.protein``) that carries the same sequence number. Pairs whose amino acids differ and
    that are not explained by a SEQADV record are re-paired through a pairwise alignment of the
    reference sequence against the chain sequence. Finally one Residue per paired chain residue is
    bulk-created under the ``<pdb code>_a`` protein, copying segment and generic numbers from the
    reference residue.

    With ``options['purge']`` set, existing ``*_a`` records of the Alpha family are deleted first.
    Any exception raised while processing one complex is reported and the next complex is tried.
    """
    self.options = options
    if self.options['purge']:
        Residue.objects.filter(
            protein_conformation__protein__entry_name__endswith='_a',
            protein_conformation__protein__family__parent__parent__name='Alpha').delete()
        ProteinConformation.objects.filter(
            protein__entry_name__endswith='_a',
            protein__family__parent__parent__name='Alpha').delete()
        Protein.objects.filter(
            entry_name__endswith='_a',
            family__parent__parent__name='Alpha').delete()

    # Groups: 1 residue name in the structure, 2 chain, 3 structure sequence number,
    # 4 database accession (padded), 5 database residue name, 6 database sequence number,
    # 7 start of the comment field. Raw string so the backslashes reach the regex engine as-is.
    seqadv_re = re.compile(
        r'SEQADV\s{1}[A-Z\s\d]{4}\s{1}([A-Z]{3})\s{1}([A-Z]{1})\s+(\d+)[\s\S\d]{5}([\s\S\d]{12})([A-Z]{3})\s+(\d+)(\s\S+)')

    # Building protein and protconf objects for g protein structure in complex
    for sc in SignprotComplex.objects.all():
        self.logger.info(
            'Protein, ProteinConformation and Residue build for alpha subunit of {} is building'
            .format(sc))
        try:
            pdb_code = sc.structure.pdb_code.index
            entry_name = pdb_code.lower() + '_a'

            # Alpha subunit protein: reuse an existing record, otherwise create it
            try:
                alpha_protein = Protein.objects.get(entry_name=entry_name)
            except Protein.DoesNotExist:
                alpha_protein = Protein()
                alpha_protein.entry_name = entry_name
                alpha_protein.accession = None
                alpha_protein.name = entry_name
                alpha_protein.sequence = sc.protein.sequence
                alpha_protein.family = sc.protein.family
                alpha_protein.parent = sc.protein
                alpha_protein.residue_numbering_scheme = sc.protein.residue_numbering_scheme
                alpha_protein.sequence_type = ProteinSequenceType.objects.get(slug='mod')
                alpha_protein.source = ProteinSource.objects.get(name='OTHER')
                alpha_protein.species = sc.protein.species
                alpha_protein.save()

            try:
                alpha_protconf = ProteinConformation.objects.get(protein__entry_name=entry_name)
            except ProteinConformation.DoesNotExist:
                alpha_protconf = ProteinConformation()
                alpha_protconf.protein = alpha_protein
                alpha_protconf.state = ProteinState.objects.get(slug='active')
                alpha_protconf.save()

            pdbp = PDBParser(PERMISSIVE=True, QUIET=True)
            structure = pdbp.get_structure('struct', StringIO(sc.structure.pdb_data.pdb))
            chain = structure[0][sc.alpha]

            # sequence numbers of all chain residues that have a CA atom
            nums = []
            for res in chain:
                try:
                    res['CA']
                except KeyError:
                    continue
                nums.append(res.get_id()[1])

            resis = Residue.objects.filter(protein_conformation__protein=sc.protein)

            # Create first alignment based on sequence numbers:
            # pdb number -> [structure residue, reference residue]
            pdb_num_dict = OrderedDict()
            for n in nums:
                # hard-coded numbering offset for the first residues of 6OIJ
                if pdb_code == '6OIJ' and n < 30:
                    nr = n + 6
                else:
                    nr = n
                pdb_num_dict[n] = [chain[n], resis.get(sequence_number=nr)]

            # Find mismatches
            mismatches = [pair for pair in pdb_num_dict.values()
                          if AA[pair[0].get_resname()] != pair[1].amino_acid]

            seqadv = [line for line in sc.structure.pdb_data.pdb.split('\n')
                      if line.startswith('SEQADV')]

            # Search for annotated engineered mutations in pdb SEQADV
            mutations, shifted_mutations = OrderedDict(), OrderedDict()
            for line in seqadv:
                line_search = seqadv_re.search(line)
                if line_search is None or line_search.group(2) != sc.alpha:
                    continue
                pdb_pos = int(line_search.group(3))
                if line_search.group(4).strip() == sc.protein.accession:
                    if line_search.group(3) == line_search.group(6):
                        mutations[pdb_pos] = [line_search.group(1), line_search.group(5)]
                    else:
                        shifted_mutations[pdb_pos] = [line_search.group(1),
                                                      line_search.group(5),
                                                      int(line_search.group(6))]
                else:
                    # Exception for 6G79
                    if line_search.group(3) != line_search.group(6) and 'CONFLICT' in line_search.group(7):
                        mutations[pdb_pos] = [line_search.group(1), line_search.group(5)]
                    # Exception for 5G53
                    if line_search.group(4).strip() != sc.protein.accession:
                        mutations[pdb_pos] = [line_search.group(1), line_search.group(5)]

            # Check and clear mismatches that are registered in pdb SEQADV as engineered mutation
            remaining_mismatches = []
            for m in mismatches:
                num = m[0].get_id()[1]
                if num in mutations:
                    if (m[0].get_resname() != mutations[num][0]
                            and m[1].amino_acid != AA[mutations[num][1]]):
                        remaining_mismatches.append(m)
                else:
                    # unannotated or shifted mutations are left for the realignment below
                    remaining_mismatches.append(m)

            # Mismatches remained possibly to seqnumber shift, making pairwise alignment to try
            # and fix alignment
            if remaining_mismatches and pdb_code not in ['6OIJ', '6OY9', '6OYA']:
                ppb = PPBuilder()
                seq = ''.join(str(pp.get_sequence())
                              for pp in ppb.build_peptides(chain, aa_only=False))
                pw2 = pairwise2.align.localms(sc.protein.sequence, seq, 2, -1, -.5, -.1)
                ref_seq, temp_seq = str(pw2[0][0]), str(pw2[0][1])

                # structure residue -> reference residue; an alignment column index (int) stands
                # in on whichever side has a gap
                pdb_wt_dict = OrderedDict()
                j, k = 0, 0
                for i, (ref, temp) in enumerate(zip(ref_seq, temp_seq)):
                    if ref != '-' and temp != '-':
                        pdb_wt_dict[pdb_num_dict[nums[k]][0]] = resis[j]
                        j += 1
                        k += 1
                    elif ref == '-':
                        pdb_wt_dict[pdb_num_dict[nums[k]][0]] = i
                        k += 1
                    elif temp == '-':
                        pdb_wt_dict[i] = resis[j]
                        j += 1

                for i, r in enumerate(remaining_mismatches):
                    r_num = r[0].get_id()[1]
                    # Adjust for shifted residue when residue is a match
                    # NOTE(review): for i == 0 this compares against the LAST mismatch
                    # (index -1); kept as-is, confirm whether that is intended.
                    if r_num - remaining_mismatches[i - 1][0].get_id()[1] > 1:
                        pdb_num_dict[r_num - 1][1] = pdb_wt_dict[chain[r_num - 1]]
                    # Adjust for shifted residue when residue is mutated and it's logged in SEQADV
                    if r_num in shifted_mutations:
                        pdb_num_dict[r_num][1] = resis.get(
                            sequence_number=shifted_mutations[r_num][2])
                    # Adjust for shift
                    else:
                        pdb_num_dict[r_num][1] = pdb_wt_dict[r[0]]

            bulked_residues = []
            for val in pdb_num_dict.values():
                if isinstance(val[1], int):
                    # the realignment placed this residue opposite a gap: nothing to copy from
                    self.logger.info(
                        'Skipped {} as no annotation was present, while building for alpha subunit of {}'
                        .format(val[1], sc))
                    continue
                res_obj = Residue()
                res_obj.sequence_number = val[0].get_id()[1]
                res_obj.amino_acid = AA[val[0].get_resname()]
                res_obj.display_generic_number = val[1].display_generic_number
                res_obj.generic_number = val[1].generic_number
                res_obj.protein_conformation = alpha_protconf
                res_obj.protein_segment = val[1].protein_segment
                bulked_residues.append(res_obj)
            Residue.objects.bulk_create(bulked_residues)

            self.logger.info(
                'Protein, ProteinConformation and Residue build for alpha subunit of {} is finished'
                .format(sc))
        except Exception as msg:
            # best effort: report the failure and carry on with the next complex
            failure = ('Protein, ProteinConformation and Residue build for alpha subunit of {} has failed'
                       .format(sc))
            print(failure)
            print(msg)
            self.logger.error('{}\n{}: {}'.format(failure, type(msg), msg))
def create_constructs(self, filenames):
    """Create construct proteins and their annotation records from YAML definition files.

    Args:
        filenames: names of files inside ``self.construct_data_dir`` to parse. When empty or
            ``None``, every entry of that directory is considered. Entries that are not regular
            files, or whose name starts with ``'.'``, are skipped.

    For each file a Protein (sequence type ``mod``, source ``OTHER``) is created as a child of the
    parent protein named by the ``protein`` key, followed by a ProteinConformation, a Construct
    and, when the corresponding sections are present, expression, solubilization, purification,
    crystallization and fusion-protein records. The parent's residues are then copied to the
    construct, leaving out deleted positions and applying the listed point mutations, and the
    construct's sequence is rebuilt from the copied residues.

    Files without a ``protein`` key, with an unknown parent protein, or whose protein or
    conformation cannot be saved are logged and skipped.
    """
    self.logger.info('CREATING CONSTRUCTS')

    def add_chemicals(chemical_list, steps, section, protein):
        """Create a Chemical + ChemicalConc per well-formed step and add it to ``chemical_list``."""
        for step in steps:
            if 'type' in step and 'item' in step and 'concentration' in step:
                chem = Chemical()
                chem.chemical_type, _ = ChemicalType.objects.get_or_create(name=step['type'])
                chem.name = step['item']
                chem.save()

                cc = ChemicalConc()
                cc.concentration = step['concentration']
                cc.chemical = chem  # since ChemicalConc has a ForeignKey to Chemical
                cc.save()
                chemical_list.chemicals.add(cc)
            else:
                self.logger.error('{} step incorrectly defined for {}'.format(section, protein))

    # what files should be parsed?
    if not filenames:
        filenames = os.listdir(self.construct_data_dir)

    # parse files
    for source_file in filenames:
        source_file_path = os.sep.join([self.construct_data_dir, source_file])
        if not os.path.isfile(source_file_path) or source_file[0] == '.':
            continue

        self.logger.info('Reading file {}'.format(source_file_path))

        # read the yaml file
        # NOTE(review): yaml.load() without an explicit Loader can construct arbitrary Python
        # objects and is rejected by recent PyYAML releases; switch to yaml.safe_load() once it
        # is confirmed that the construct files use plain YAML only.
        with open(source_file_path, 'r') as f:
            sd = yaml.load(f)

        # is a protein specified?
        if 'protein' not in sd:
            self.logger.error('Protein not specified for construct, skipping')
            continue

        # fetch the parent protein
        try:
            ppc = ProteinConformation.objects.select_related(
                'protein__family', 'protein__species', 'protein__residue_numbering_scheme').get(
                    protein__entry_name=sd['protein'], state__slug=settings.DEFAULT_PROTEIN_STATE)
        except ProteinConformation.DoesNotExist:
            # abort if parent protein is not found
            self.logger.error('Parent protein {} for construct {} not found, aborting!'.format(
                sd['protein'], sd['name']))
            continue

        # create a protein record
        p = Protein()
        p.parent = ppc.protein
        p.family = ppc.protein.family
        p.species = ppc.protein.species
        p.residue_numbering_scheme = ppc.protein.residue_numbering_scheme
        p.sequence_type, _ = ProteinSequenceType.objects.get_or_create(
            slug='mod', defaults={'name': 'Modified'})
        p.source, _ = ProteinSource.objects.get_or_create(name='OTHER')
        p.entry_name = slugify(strip_tags(sd['name']))
        p.name = sd['name']
        p.sequence = ppc.protein.sequence

        # save protein (construct)
        try:
            p.save()
            self.logger.info('Created construct {} with parent protein {}'.format(
                p.name, ppc.protein.entry_name))
        except Exception as e:
            print(e)
            self.logger.error('Failed creating construct {} with parent protein {}'.format(
                p.name, ppc.protein.entry_name))
            continue

        # create protein conformation record
        pc = ProteinConformation()
        pc.protein = p
        pc.state = ProteinState.objects.get(slug=settings.DEFAULT_PROTEIN_STATE)
        try:
            pc.save()
            self.logger.info('Created conformation {} of protein {}'.format(pc.state.name, p.name))
        except Exception:
            self.logger.error('Failed creating conformation {} of protein {}'.format(
                pc.state.name, p.entry_name))
            # every record below hangs off this conformation, so nothing more can be built
            continue

        # deleted positions (inclusive ranges) and their "start-end" summary string
        deleted_positions = set()
        deletions_list = []
        if 'deletions' in sd and sd['deletions']:
            for t in sd['deletions']:
                deleted_positions.update(range(t[0], t[1] + 1))
                deletions_list.append(str(t[0]) + '-' + str(t[1]))
        deletion_string = ','.join(deletions_list)

        # point mutations written as <wild type><position><mutant>, keyed by integer position so
        # that they can be matched against Residue.sequence_number
        mutations = {}
        if 'mutations' in sd and sd['mutations']:
            for m in sd['mutations']:
                try:
                    res_num = int(m[1:-1])
                except ValueError:
                    self.logger.error('Mutation {} in construct {} has no numeric position, skipping'
                                      .format(m, sd['name']))
                    continue
                mutations[res_num] = {
                    'wt_res': m[0],
                    'mut_res': m[-1],
                    'full': m,
                }

        # Create construct record
        c = Construct()
        c.protein_conformation = pc
        c.deletions = deletion_string
        c.save()

        # create expression records
        if 'expression_sys' in sd and sd['expression_sys']:
            expression = sd['expression_sys']
            ce = ConstructExpression()
            ce.construct = c
            ce.expression_system, _ = ConstructExpressionSystem.objects.get_or_create(
                expression_method=expression['expression_method'],
                host_cell_type=expression['host_cell_type'],
                host_cell=expression['host_cell'])
            # remarks belong to the expression section, not to the top level of the file
            if 'remarks' in expression:
                ce.remarks = expression['remarks']
            ce.save()

        # create solubilization records
        if ('solubilization' in sd and sd['solubilization']
                and 'steps' in sd['solubilization'] and sd['solubilization']['steps']):
            so = ConstructSolubilization()
            so.construct = c
            cl = ChemicalList.objects.create()
            so.chemical_list = cl
            add_chemicals(cl, sd['solubilization']['steps'], 'Solubilization', p)
            if 'remarks' in sd['solubilization']:
                so.remarks = sd['solubilization']['remarks']
            so.save()

        # create purification records
        if 'purification' in sd and sd['purification'] and sd['purification'].get('steps'):
            pu = ConstructPurification()
            pu.construct = c
            if 'remarks' in sd['purification']:
                pu.remarks = sd['purification']['remarks']
            pu.save()
            for step in sd['purification']['steps']:
                if 'type' in step and 'description' in step:
                    pust = PurificationStep()
                    pust.description = step['description']
                    pust.purification = pu
                    pust.purification_type, created = PurificationStepType.objects.get_or_create(
                        name=step['type'])
                    if created:
                        self.logger.info('Created purification step type {}'.format(
                            pust.purification_type))
                    pust.save()
                else:
                    self.logger.error('Purification step incorrectly defined for {}'.format(p))

        # create crystallization records
        if 'crystallization' in sd and sd['crystallization']:
            crystallization = sd['crystallization']
            cy = ConstructCrystallization()
            cy.construct = c
            cy.crystal_type = CrystallizationMethodTypes.objects.create()
            cy.method = crystallization['method']
            cy.settings = crystallization['settings']
            cy.protein_conc = crystallization['protein_conc']
            cl = ChemicalList.objects.create()
            cy.chemical_list = cl
            add_chemicals(cl, crystallization['chemicallist'], 'Crystallization', p)
            cy.aqueous_solution_lipid_ratio = crystallization['aqueous_solution_lipid_ratio_LCP']
            cy.lcp_bolus_volume = crystallization['LCP_bolus_volume']
            cy.precipitant_solution_volume = crystallization['precipitant_solution_volume']
            cy.temp = crystallization['temperature']
            cy.ph = crystallization['ph']
            if 'remarks' in crystallization:
                cy.remarks = crystallization['remarks']
            cy.save()

        # fusion proteins
        split_segments = {}
        if 'fusion_proteins' in sd and sd['fusion_proteins']:
            for fp in sd['fusion_proteins']:
                fp_start = Residue.objects.get(protein_conformation=ppc,
                                               sequence_number=fp['positions'][0])
                fp_end = Residue.objects.get(protein_conformation=ppc,
                                             sequence_number=fp['positions'][1])

                # if the fusion protein is inserted within only one segment (the usual case),
                # split that segment into two segments
                if fp_start.protein_segment == fp_end.protein_segment:
                    # get/create split protein segments
                    segment_before, _ = ProteinSegment.objects.get_or_create(
                        slug=fp_start.protein_segment.slug + "_1",
                        defaults={
                            'name': fp_start.protein_segment.name,
                            'category': fp_start.protein_segment.category,
                            'partial': True})
                    segment_after, _ = ProteinSegment.objects.get_or_create(
                        slug=fp_start.protein_segment.slug + "_2",
                        defaults={
                            'name': fp_start.protein_segment.name,
                            'category': fp_start.protein_segment.category,
                            'partial': True})

                    # keep track of information about split segments
                    split_segments[fp_start.protein_segment.slug] = {
                        'start': {
                            'sequence_number': fp['positions'][0],
                            'segment': segment_before,
                        },
                        'end': {
                            'sequence_number': fp['positions'][1],
                            'segment': segment_after,
                        },
                    }
                else:
                    # insertion between two different segments: they already are the flanks
                    # (without this the names were unbound, or stale from a previous fusion)
                    segment_before = fp_start.protein_segment
                    segment_after = fp_end.protein_segment

                # get/insert fusion protein
                fusion, _ = ProteinFusion.objects.get_or_create(
                    name=fp['name'], defaults={'sequence': fp['sequence']})

                # create relationship with protein
                ProteinFusionProtein.objects.create(
                    protein=p, protein_fusion=fusion,
                    segment_before=segment_before, segment_after=segment_after)

        # copy the parent's residues to the construct
        prs = Residue.objects.filter(protein_conformation=ppc).prefetch_related(
            'protein_conformation__protein', 'protein_segment', 'generic_number',
            'display_generic_number__scheme', 'alternative_generic_numbers__scheme')
        sequence_parts = []
        for pr in prs:
            if pr.sequence_number in deleted_positions:
                continue

            r = Residue()
            r.protein_conformation = pc
            r.generic_number = pr.generic_number
            r.display_generic_number = pr.display_generic_number
            r.sequence_number = pr.sequence_number

            # check for split segments
            if pr.protein_segment.slug in split_segments:
                split = split_segments[pr.protein_segment.slug]
                if r.sequence_number <= split['start']['sequence_number']:
                    r.protein_segment = split['start']['segment']
                elif r.sequence_number >= split['end']['sequence_number']:
                    r.protein_segment = split['end']['segment']
                # positions strictly between the fusion boundaries get no segment here
            else:
                r.protein_segment = pr.protein_segment

            # amino acid, check for mutations
            mutation = mutations.get(r.sequence_number)
            if mutation is None:
                r.amino_acid = pr.amino_acid
            elif mutation['wt_res'] == pr.amino_acid:
                r.amino_acid = mutation['mut_res']
            else:
                self.logger.error(
                    'Mutation {} in construct {} does not match wild-type sequence of {}'.format(
                        mutation['full'], pc.protein.name, ppc.protein.entry_name))
                # keep the parent's residue instead of saving a residue without an amino acid
                r.amino_acid = pr.amino_acid

            # save amino acid to updated sequence
            sequence_parts.append(r.amino_acid)

            # save residue before populating M2M relations
            r.save()

            # alternative generic numbers
            for agn in pr.alternative_generic_numbers.all():
                r.alternative_generic_numbers.add(agn)

        # update sequence
        p.sequence = ''.join(sequence_parts)
        p.save()

    self.logger.info('COMPLETED CREATING CONSTRUCTS')
def handle(self, *args, **options):
    """Build alpha-subunit records for complexes, then SignprotStructure records for the rest.

    Phase 1 (skipped when ``options['only_signprot_structures']`` is set): for every
    SignprotComplex the alpha chain is parsed from the stored PDB text. Each chain residue that
    has a CA atom is paired with the reference protein's residue of the same sequence number.
    Pairs whose amino acids differ and that are not explained by a SEQADV record are re-paired
    through a pairwise alignment. One Residue per annotated chain residue is bulk-created under
    the ``<pdb code>_a`` protein.

    Phase 2: for every protein whose family slug starts with ``100_001`` and that has an
    accession, each PDB id returned by ``get_pdb_ids`` that is not already a complex structure is
    fetched with ``self.fetch_gprot_data`` and, when data came back, built with
    ``self.build_g_prot_struct``.

    With ``options['purge']`` set, existing ``*_a`` Alpha records and all SignprotStructure /
    SignprotStructureExtraProteins rows are deleted first. A failure on one complex or one PDB id
    is logged as an error and processing continues with the next one.
    """
    self.options = options
    if self.options['purge']:
        Residue.objects.filter(
            protein_conformation__protein__entry_name__endswith='_a',
            protein_conformation__protein__family__parent__parent__name='Alpha').delete()
        ProteinConformation.objects.filter(
            protein__entry_name__endswith='_a',
            protein__family__parent__parent__name='Alpha').delete()
        Protein.objects.filter(
            entry_name__endswith='_a',
            family__parent__parent__name='Alpha').delete()
        SignprotStructureExtraProteins.objects.all().delete()
        SignprotStructure.objects.all().delete()

    if not options['only_signprot_structures']:
        # Groups: 1 residue name in the structure, 2 chain, 3 structure sequence number,
        # 4 database accession (padded), 5 database residue name, 6 database sequence number,
        # 7 start of the comment field. Raw string so the backslashes reach the regex engine.
        seqadv_re = re.compile(
            r'SEQADV\s{1}[A-Z\s\d]{4}\s{1}([A-Z]{3})\s{1}([A-Z]{1})\s+(\d+)[\s\S\d]{5}([\s\S\d]{12})([A-Z]{3})\s+(\d+)(\s\S+)')

        # Building protein and protconf objects for g protein structure in complex
        for sc in SignprotComplex.objects.all():
            self.logger.info(
                'Protein, ProteinConformation and Residue build for alpha subunit of {} is building'
                .format(sc))
            try:
                pdb_code = sc.structure.pdb_code.index
                entry_name = pdb_code.lower() + '_a'

                # Alpha subunit protein: reuse an existing record, otherwise create it
                try:
                    alpha_protein = Protein.objects.get(entry_name=entry_name)
                except Protein.DoesNotExist:
                    alpha_protein = Protein()
                    alpha_protein.entry_name = entry_name
                    alpha_protein.accession = None
                    alpha_protein.name = entry_name
                    alpha_protein.sequence = sc.protein.sequence
                    alpha_protein.family = sc.protein.family
                    alpha_protein.parent = sc.protein
                    alpha_protein.residue_numbering_scheme = sc.protein.residue_numbering_scheme
                    alpha_protein.sequence_type = ProteinSequenceType.objects.get(slug='mod')
                    alpha_protein.source = ProteinSource.objects.get(name='OTHER')
                    alpha_protein.species = sc.protein.species
                    alpha_protein.save()

                try:
                    alpha_protconf = ProteinConformation.objects.get(
                        protein__entry_name=entry_name)
                except ProteinConformation.DoesNotExist:
                    alpha_protconf = ProteinConformation()
                    alpha_protconf.protein = alpha_protein
                    alpha_protconf.state = ProteinState.objects.get(slug='active')
                    alpha_protconf.save()

                pdbp = PDBParser(PERMISSIVE=True, QUIET=True)
                structure = pdbp.get_structure('struct', StringIO(sc.structure.pdb_data.pdb))
                chain = structure[0][sc.alpha]

                # sequence numbers of all chain residues that have a CA atom
                nums = []
                for res in chain:
                    try:
                        res['CA']
                    except KeyError:
                        continue
                    nums.append(res.get_id()[1])

                resis = Residue.objects.filter(protein_conformation__protein=sc.protein)

                # Create first alignment based on sequence numbers:
                # pdb number -> [structure residue, reference residue]
                pdb_num_dict = OrderedDict()
                for n in nums:
                    # hard-coded numbering offset for the first residues of 6OIJ
                    if pdb_code == '6OIJ' and n < 30:
                        nr = n + 6
                    else:
                        nr = n
                    pdb_num_dict[n] = [chain[n], resis.get(sequence_number=nr)]

                # Find mismatches
                mismatches = [pair for pair in pdb_num_dict.values()
                              if AA[pair[0].get_resname()] != pair[1].amino_acid]

                seqadv = [line for line in sc.structure.pdb_data.pdb.split('\n')
                          if line.startswith('SEQADV')]

                # Search for annotated engineered mutations in pdb SEQADV
                mutations, shifted_mutations = OrderedDict(), OrderedDict()
                for line in seqadv:
                    line_search = seqadv_re.search(line)
                    if line_search is None or line_search.group(2) != sc.alpha:
                        continue
                    pdb_pos = int(line_search.group(3))
                    if line_search.group(4).strip() == sc.protein.accession:
                        if line_search.group(3) == line_search.group(6):
                            mutations[pdb_pos] = [line_search.group(1), line_search.group(5)]
                        else:
                            shifted_mutations[pdb_pos] = [line_search.group(1),
                                                          line_search.group(5),
                                                          int(line_search.group(6))]
                    else:
                        # Exception for 6G79
                        if (line_search.group(3) != line_search.group(6)
                                and 'CONFLICT' in line_search.group(7)):
                            mutations[pdb_pos] = [line_search.group(1), line_search.group(5)]
                        # Exception for 5G53
                        if line_search.group(4).strip() != sc.protein.accession:
                            mutations[pdb_pos] = [line_search.group(1), line_search.group(5)]

                # Check and clear mismatches that are registered in pdb SEQADV as engineered
                # mutation
                remaining_mismatches = []
                for m in mismatches:
                    num = m[0].get_id()[1]
                    if num in mutations:
                        if (m[0].get_resname() != mutations[num][0]
                                and m[1].amino_acid != AA[mutations[num][1]]):
                            remaining_mismatches.append(m)
                    else:
                        # unannotated or shifted mutations are left for the realignment below
                        remaining_mismatches.append(m)

                # Mismatches remained possibly to seqnumber shift, making pairwise alignment to
                # try and fix alignment (the listed structures are excluded from realignment)
                if remaining_mismatches and pdb_code not in [
                        '6OIJ', '6OY9', '6OYA', '6LPB', '6WHA']:
                    ppb = PPBuilder()
                    seq = ''.join(str(pp.get_sequence())
                                  for pp in ppb.build_peptides(chain, aa_only=False))
                    pw2 = pairwise2.align.localms(sc.protein.sequence, seq, 2, -1, -.5, -.1)
                    ref_seq, temp_seq = str(pw2[0][0]), str(pw2[0][1])

                    # structure residue -> reference residue; an alignment column index (int)
                    # stands in on whichever side has a gap
                    pdb_wt_dict = OrderedDict()
                    j, k = 0, 0
                    for i, (ref, temp) in enumerate(zip(ref_seq, temp_seq)):
                        if ref != '-' and temp != '-':
                            pdb_wt_dict[pdb_num_dict[nums[k]][0]] = resis[j]
                            j += 1
                            k += 1
                        elif ref == '-':
                            pdb_wt_dict[pdb_num_dict[nums[k]][0]] = i
                            k += 1
                        elif temp == '-':
                            pdb_wt_dict[i] = resis[j]
                            j += 1

                    for i, r in enumerate(remaining_mismatches):
                        r_num = r[0].get_id()[1]
                        # Adjust for shifted residue when residue is a match
                        # NOTE(review): for i == 0 this compares against the LAST mismatch
                        # (index -1); kept as-is, confirm whether that is intended.
                        if r_num - remaining_mismatches[i - 1][0].get_id()[1] > 1:
                            pdb_num_dict[r_num - 1][1] = pdb_wt_dict[chain[r_num - 1]]
                        # Adjust for shifted residue when residue is mutated and it's logged in
                        # SEQADV
                        if r_num in shifted_mutations:
                            pdb_num_dict[r_num][1] = resis.get(
                                sequence_number=shifted_mutations[r_num][2])
                        # Adjust for shift
                        else:
                            pdb_num_dict[r_num][1] = pdb_wt_dict[r[0]]

                bulked_residues = []
                for val in pdb_num_dict.values():
                    if isinstance(val[1], int):
                        # the realignment placed this residue opposite a gap: nothing to copy
                        self.logger.info(
                            'Skipped {} as no annotation was present, while building for alpha subunit of {}'
                            .format(val[1], sc))
                        continue
                    res_obj = Residue()
                    res_obj.sequence_number = val[0].get_id()[1]
                    res_obj.amino_acid = AA[val[0].get_resname()]
                    res_obj.display_generic_number = val[1].display_generic_number
                    res_obj.generic_number = val[1].generic_number
                    res_obj.protein_conformation = alpha_protconf
                    res_obj.protein_segment = val[1].protein_segment
                    bulked_residues.append(res_obj)
                Residue.objects.bulk_create(bulked_residues)

                self.logger.info(
                    'Protein, ProteinConformation and Residue build for alpha subunit of {} is finished'
                    .format(sc))
            except Exception as msg:
                # best effort: record why this complex failed and carry on with the next one
                self.logger.error(
                    'Protein, ProteinConformation and Residue build for alpha subunit of {} has failed\n{}: {}'
                    .format(sc, type(msg), msg))

    ### Build SignprotStructure objects from non-complex signprots
    g_prot_alphas = Protein.objects.filter(
        family__slug__startswith='100_001',
        accession__isnull=False)
    # set: membership is tested once per PDB id of every alpha protein
    complex_structures = set(SignprotComplex.objects.all().values_list(
        'structure__pdb_code__index', flat=True))
    for a in g_prot_alphas:
        for pdb in get_pdb_ids(a.accession):
            if pdb in complex_structures:
                continue
            try:
                data = self.fetch_gprot_data(pdb, a)
                if data:
                    self.build_g_prot_struct(a, pdb, data)
            except Exception as msg:
                self.logger.error(
                    'SignprotStructure of {} {} failed\n{}: {}'.format(
                        a.entry_name, pdb, type(msg), msg))
def create_residues(self, args):
    """Create Residue records and their generic numbers from residue dump files.

    Each entry of *args* is a file name relative to ``self.dump_source_dir``.
    Every line of such a file is split on commas into ten fields:
    id, res_num, res_name, oli, gpcrdb, bw, bw2, bs, prot_name, sec_str_name.

    For each line the residue is created (unless one with the same sequence
    number already exists for the protein's default-state conformation), its
    protein segment is looked up by slug, and - when the line carries
    generic-number information - the default, sequence-based and
    structure-based generic numbers are fetched or created and linked.

    Conversion tables are read from ``mapping_<scheme>.txt`` files in
    ``self.generic_numbers_source_dir`` when those files exist.
    """
    schemes = {
        'gpcrdb': {'type': False},
        'gpcrdba': {
            'type': 'structure',
            'seq_based': 'bw',
        },
        'gpcrdbb': {
            'type': 'structure',
            'seq_based': 'woot',
        },
        'gpcrdbc': {
            'type': 'structure',
            'seq_based': 'pin',
        },
        'gpcrdbf': {
            'type': 'structure',
            'seq_based': 'wang',
        },
        'bw': {'type': 'sequence'},
        'woot': {'type': 'sequence'},
        'pin': {'type': 'sequence'},
        'wang': {'type': 'sequence'},
    }

    # attach the scheme object and, when a mapping file exists, its conversion table
    for scheme_name, scheme in schemes.items():
        scheme['obj'] = ResidueNumberingScheme.objects.get(slug=scheme_name)
        mapping_file = os.sep.join([self.generic_numbers_source_dir, 'mapping_' + scheme_name + '.txt'])
        if os.path.isfile(mapping_file):
            with open(mapping_file, "r", encoding='UTF-8') as scheme_table_file:
                scheme['table'] = {}
                for row in scheme_table_file:
                    split_row = shlex.split(row)
                    if len(split_row) < 2:
                        # blank or incomplete row, nothing to map
                        continue
                    scheme['table'][split_row[0]] = split_row[1]

    def fetch_or_create_number(label, scheme_obj, residue):
        """Return ``(number, created)`` for *label* in *scheme_obj*, inserting it if missing."""
        try:
            return ResidueGenericNumber.objects.get(label=label, scheme=scheme_obj), False
        except ResidueGenericNumber.DoesNotExist:
            number = ResidueGenericNumber()
            number.label = label
            number.scheme = scheme_obj
            number.protein_segment = residue.protein_segment
            number.save()
            return number, True

    # proteins that were looked up once and not found; their lines are skipped
    missing_proteins = set()

    self.logger.info('CREATING RESIDUES')
    for arg in args:
        dump_path = os.sep.join([self.dump_source_dir, arg])
        if not os.path.exists(dump_path):
            print("Failed to open file {!s}".format(dump_path))
            self.logger.error("Failed to open file {!s}".format(dump_path))
            continue
        self.logger.info('Parsing residue data from {}'.format(arg))

        # the context manager guarantees the dump file is closed again
        with open(dump_path, 'r') as residue_data_fh:
            for line in residue_data_fh:
                if not line.strip():
                    # a blank line cannot be unpacked into the ten fields
                    continue
                # double strip: surrounding whitespace first, then the quotes
                (_row_id, res_num, res_name, oli, gpcrdb, bw, bw2, bs, prot_name,
                    sec_str_name) = [x.strip().strip('"') for x in line.split(',')]

                if prot_name in missing_proteins:
                    continue

                # check that the protein exists in the db
                try:
                    pconf = ProteinConformation.objects.get(protein__entry_name=prot_name,
                        state__slug=settings.DEFAULT_PROTEIN_STATE)
                except ProteinConformation.DoesNotExist:
                    missing_proteins.add(prot_name)
                    continue

                # skip residues that already exist in the db
                if Residue.objects.filter(protein_conformation=pconf.id,
                        sequence_number=res_num).exists():
                    continue

                r = Residue()
                r.protein_conformation = pconf
                r.sequence_number = int(res_num)
                r.amino_acid = polypeptide.three_to_one(res_name.upper())

                # the residue has to be saved before M2M relations can be added
                try:
                    r.save()
                    self.logger.info('Created residue {:n}{!s} for protein {!s}'.format(r.sequence_number,
                        r.amino_acid, pconf.protein.entry_name))
                except Exception as msg:
                    print(msg)
                    self.logger.error('Failed to create residue {:n}{!s} for protein {!s}'.format(
                        r.sequence_number, r.amino_acid, pconf.protein.entry_name))
                    continue

                # residue segment (best effort: a missing segment is only logged)
                dump_segment = sec_str_name
                try:
                    r.protein_segment = ProteinSegment.objects.get(slug=dump_segment)
                except Exception:
                    self.logger.error('Failed to fetch protein segment {}'.format(dump_segment))

                # generic number
                if str(oli) != '0' and gpcrdb != 'None' and bw != 'None':
                    # separate bulge number (1241 - > 124 + 1)
                    bulge_prime = ''
                    dump_oliveira = str(oli)
                    if len(dump_oliveira) == 4:
                        bulge_prime = dump_oliveira[3]
                        dump_oliveira = dump_oliveira[:3]
                    dump_gpcrdb = gpcrdb[:4]
                    dump_seq_based = bw

                    # default gpcrdb number
                    default_scheme = schemes[settings.DEFAULT_NUMBERING_SCHEME]
                    def_gpcrdb = None
                    if dump_oliveira in default_scheme['table']:
                        default_label = default_scheme['table'][dump_oliveira] + bulge_prime
                        def_gpcrdb, created = fetch_or_create_number(default_label,
                            default_scheme['obj'], r)
                        if created:
                            self.logger.info('Created generic number {:s} in numbering scheme {:s}'
                                .format(default_label, default_scheme['obj'].short_name))

                    # if default number was found/added successfully, process the alternative numbers
                    if def_gpcrdb:
                        # add default generic number to residue record
                        r.generic_number = def_gpcrdb

                        protein_scheme_slug = pconf.protein.residue_numbering_scheme.slug
                        protein_scheme = schemes[protein_scheme_slug]
                        protein_seq_scheme = protein_scheme['seq_based']

                        # dict of sequence-based numbers, for use in structure-based numbers (5.46x461)
                        seq_based_labels = {}

                        # sequence-based schemes first (the sequence-based numbers are needed for the
                        # structure based schemes)
                        for scheme_name, scheme in schemes.items():
                            if scheme['type'] != 'sequence':
                                continue

                            if scheme_name == protein_seq_scheme:
                                # this number is in the scheme defined for this protein
                                seq_based_label = dump_seq_based
                            else:
                                # convert the number to the correct scheme; start from None so a
                                # failed lookup can never reuse the label of a previous scheme/residue
                                seq_based_label = None
                                for d, c in schemes[protein_seq_scheme]['table'].items():
                                    if c == dump_seq_based:
                                        seq_based_label = scheme['table'][d]
                                        break

                            if seq_based_label is None:
                                self.logger.error(
                                    'No {} equivalent found for number {} of residue {}{!s} for protein {!s}'
                                    .format(scheme_name, dump_seq_based, res_num, res_name,
                                        pconf.protein.entry_name))
                                continue

                            # fetch/insert the number
                            seq_based, created = fetch_or_create_number(seq_based_label,
                                scheme['obj'], r)
                            r.alternative_generic_numbers.add(seq_based)

                            # add added number to the dict for later use
                            seq_based_labels[scheme_name] = seq_based_label

                        # structure-based numbers
                        for scheme_name, scheme in schemes.items():
                            if scheme['type'] != 'structure':
                                continue

                            if scheme_name == protein_scheme_slug:
                                # this number is in the scheme defined for this protein
                                struct_based_label = dump_gpcrdb + bulge_prime
                            else:
                                # convert the number to the correct scheme (None = not found)
                                struct_based_label = None
                                for d, c in protein_scheme['table'].items():
                                    if c == dump_gpcrdb:
                                        struct_based_label = scheme['table'][d] + bulge_prime
                                        break

                            if struct_based_label is None or scheme['seq_based'] not in seq_based_labels:
                                self.logger.error(
                                    'No {} equivalent found for number {} of residue {}{!s} for protein {!s}'
                                    .format(scheme_name, dump_gpcrdb, res_num, res_name,
                                        pconf.protein.entry_name))
                                continue

                            # add the sequence-based label (5x461 -> 5.46x461)
                            split_struct_based_label = struct_based_label.split('x')
                            struct_based_label = (seq_based_labels[scheme['seq_based']] + 'x'
                                + split_struct_based_label[1])

                            # fetch/insert the number
                            struct_based, created = fetch_or_create_number(struct_based_label,
                                scheme['obj'], r)

                            # add to residue as a display number or alternative number?
                            if scheme_name == protein_scheme_slug:
                                r.display_generic_number = struct_based
                            else:
                                r.alternative_generic_numbers.add(struct_based)

                # persist the segment and generic-number foreign keys set above
                try:
                    r.save()
                    self.logger.info('Added generic numbers for residue {}{!s} for protein {!s}'.format(res_num,
                        res_name, pconf.protein.entry_name))
                except Exception as msg:
                    print(msg)
                    self.logger.error(
                        'Failed to create generic numbers for residue {}{!s} for protein {!s}'.format(res_num,
                            res_name, pconf.protein.entry_name))

    self.logger.info('COMPLETED CREATING RESIDUES')
def main_func(self, positions, iteration):
    """Build construct proteins and their annotation records from YAML files.

    *positions* is a two-item sequence of indices into ``self.filenames``;
    a falsy second item means "up to the end of the list". *iteration* is
    accepted for interface compatibility and is not used here.

    For every selected file in ``self.construct_data_dir`` the method creates
    a construct Protein (child of the protein named in the file), its
    ProteinConformation, the deletion / mutation / fusion-insertion records,
    optional expression, solubilization, purification and crystallization
    records, and finally copies the parent's residues (minus deletions, with
    mutations applied) to the construct and stores the resulting sequence.
    """
    # filenames
    if not positions[1]:
        filenames = self.filenames[positions[0]:]
    else:
        filenames = self.filenames[positions[0]:positions[1]]

    # parse files
    for source_file in filenames:
        source_file_path = os.sep.join([self.construct_data_dir, source_file])
        # only regular, non-hidden files are processed
        if not os.path.isfile(source_file_path) or source_file[0] == '.':
            continue
        self.logger.info('Reading file {}'.format(source_file_path))

        # read the yaml file; safe_load only builds plain data types and, unlike a
        # bare yaml.load(f), does not need an explicit Loader argument
        with open(source_file_path, 'r') as f:
            sd = yaml.safe_load(f)

        # is a protein specified? (an empty file yields None)
        if not sd or 'protein' not in sd:
            self.logger.error('Protein not specified for construct, skipping')
            continue

        # fetch the parent protein
        try:
            ppc = ProteinConformation.objects.prefetch_related('protein__family', 'protein__species',
                'protein__residue_numbering_scheme').get(protein__entry_name=sd['protein'],
                state__slug=settings.DEFAULT_PROTEIN_STATE)
        except ProteinConformation.DoesNotExist:
            # abort if parent protein is not found
            self.logger.error('Parent protein {} for construct {} not found, aborting!'.format(
                sd['protein'], sd['name']))
            continue

        # sequence type
        try:
            sequence_type, created = ProteinSequenceType.objects.get_or_create(slug='mod',
                defaults={'name': 'Modified'})
            if created:
                self.logger.info('Created sequence type {}'.format(sequence_type))
        except IntegrityError:
            sequence_type = ProteinSequenceType.objects.get(slug='mod')

        # protein source
        try:
            protein_source, created = ProteinSource.objects.get_or_create(name='OTHER')
            if created:
                self.logger.info('Created protein source {}'.format(protein_source))
        except IntegrityError:
            protein_source = ProteinSource.objects.get(name='OTHER')

        # create a protein record
        p = Protein()
        p.parent = ppc.protein
        p.family = ppc.protein.family
        p.species = ppc.protein.species
        p.residue_numbering_scheme = ppc.protein.residue_numbering_scheme
        p.sequence_type = sequence_type
        p.source = protein_source
        p.entry_name = slugify(strip_tags(sd['name']))
        p.name = sd['name']
        p.sequence = ppc.protein.sequence

        # save protein (construct)
        try:
            p.save()
            self.logger.info('Created construct {} with parent protein {}'.format(p.name,
                ppc.protein.entry_name))
        except Exception:
            self.logger.error('Failed creating construct {} with parent protein {}'.format(p.name,
                ppc.protein.entry_name))
            continue

        # create protein conformation record
        pc = ProteinConformation()
        pc.protein = p
        pc.state = ProteinState.objects.get(slug=settings.DEFAULT_PROTEIN_STATE)
        try:
            pc.save()
            self.logger.info('Created conformation {} of protein {}'.format(pc.state.name, p.name))
        except Exception:
            self.logger.error('Failed creating conformation {} of protein {}'.format(pc.state.name,
                p.entry_name))
            # every record below references pc, so nothing more can be built for this file
            continue

        # process deletions (save in db, and for sequence processing)
        deletions = []
        if 'deletions' in sd and sd['deletions']:
            for t in sd['deletions']:
                deletions += list(range(t[0], t[1] + 1))
                ConstructDeletion.objects.create(construct=pc, start=t[0], end=t[1])
                # create() always inserts a row, so the log is unconditional
                self.logger.info('Created deletion {}-{} for {}'.format(t[0], t[1],
                    pc.protein.entry_name))

        # process mutations (save in db, and for sequence processing)
        mutations = {}
        if 'mutations' in sd and sd['mutations']:
            for m in sd['mutations']:
                # a mutation is written as <wild type><sequence number><mutant>
                res_num = int(m[1:-1])
                mutations[res_num] = {
                    'wt_res': m[0],
                    'mut_res': m[-1],
                    'full': m,
                }
                ConstructMutation.objects.get_or_create(
                    construct=pc,
                    sequence_number=res_num,
                    wild_type_amino_acid=m[0],
                    mutated_amino_acid=m[-1],
                )

        # insertions
        split_segments = {}
        if 'insertions' in sd and sd['insertions']:
            for ins in sd['insertions']:
                ins_start = Residue.objects.get(protein_conformation=ppc,
                    sequence_number=ins['positions'][0])
                ins_end = Residue.objects.get(protein_conformation=ppc,
                    sequence_number=ins['positions'][1])

                # if the insertion is within only one segment (the usual case), split that
                # segment into two segments
                if ins_start and ins_start.protein_segment == ins_end.protein_segment:
                    # get/create split protein segments
                    slug_1 = ins_start.protein_segment.slug + "_1"
                    try:
                        segment_before, created = ProteinSegment.objects.get_or_create(slug=slug_1,
                            defaults={'name': ins_start.protein_segment.name,
                            'category': ins_start.protein_segment.category, 'partial': True})
                        if created:
                            self.logger.info('Created protein segment {}'.format(segment_before))
                    except IntegrityError:
                        segment_before = ProteinSegment.objects.get(slug=slug_1)

                    slug_2 = ins_start.protein_segment.slug + "_2"
                    try:
                        segment_after, created = ProteinSegment.objects.get_or_create(slug=slug_2,
                            defaults={'name': ins_start.protein_segment.name,
                            'category': ins_start.protein_segment.category, 'partial': True})
                        if created:
                            self.logger.info('Created protein segment {}'.format(segment_after))
                    except IntegrityError:
                        segment_after = ProteinSegment.objects.get(slug=slug_2)

                    # keep track of information about split segments
                    split_segments[ins_start.protein_segment.slug] = {
                        'start': {
                            'sequence_number': ins['positions'][0],
                            'segment': segment_before,
                        },
                        'end': {
                            'sequence_number': ins['positions'][1],
                            'segment': segment_after,
                        },
                    }
                # if the insertion covers two segments, use those two as the segments before and after
                elif ins_start:
                    segment_before = ins_start.protein_segment
                    segment_after = ins_end.protein_segment

                # if the insertion replaces a part of the sequence, add that range as a deletion
                if ins['positions'][1] > (ins['positions'][0] + 1):
                    deletions += list(range((ins['positions'][0] + 1), ins['positions'][1]))

                # get/insert fusion protein
                fusion, _ = ProteinFusion.objects.get_or_create(name=ins['name'], defaults={
                    'sequence': ins['sequence']})

                # create relationship with protein
                ProteinFusionProtein.objects.create(protein=p, protein_fusion=fusion,
                    segment_before=segment_before, segment_after=segment_after)

        # create expression records
        if 'expression_sys' in sd and sd['expression_sys']:
            ce = Expression()
            ce.construct = pc
            ce.expression_system, created = ExpressionSystem.objects.get_or_create(
                expression_method=sd['expression_sys']['expression_method'],
                host_cell_type=sd['expression_sys']['host_cell_type'],
                host_cell=sd['expression_sys']['host_cell'])
            # the remarks live inside the expression_sys mapping, not at the top level
            if 'remarks' in sd['expression_sys']:
                ce.remarks = sd['expression_sys']['remarks']
            ce.save()

        # create solubilization records
        if ('solubilization' in sd and sd['solubilization'] and 'steps' in sd['solubilization']
                and sd['solubilization']['steps']):
            so = Solubilization()
            so.construct = pc
            cl = ChemicalList.objects.create()
            so.chemical_list = cl

            for step in sd['solubilization']['steps']:
                if 'type' in step and 'item' in step and 'concentration' in step:
                    chem = Chemical()
                    chem.chemical_type, created = ChemicalType.objects.get_or_create(name=step['type'])
                    chem.name = step['item']
                    chem.save()

                    cc = ChemicalConc()
                    cc.concentration = step['concentration']
                    cc.chemical = chem  # since ChemicalConc has a ForeignKey to Chemical
                    cc.save()
                    cl.chemicals.add(cc)
                else:
                    self.logger.error('Solubilization step incorrectly defined for {}'.format(p))

            if 'remarks' in sd['solubilization']:
                so.remarks = sd['solubilization']['remarks']
            so.save()

        # create purification records (a missing 'steps' key means nothing to do)
        if 'purification' in sd and sd['purification'] and sd['purification'].get('steps'):
            pu = Purification()
            pu.construct = pc
            if 'remarks' in sd['purification']:
                pu.remarks = sd['purification']['remarks']
            # saved before the steps, which reference it through a foreign key
            pu.save()

            for step in sd['purification']['steps']:
                if 'type' in step and 'description' in step:
                    pust = PurificationStep()
                    pust.description = step['description']
                    pust.purification = pu
                    # 2 values returned by get_or_create
                    pust.purification_type, created = PurificationStepType.objects.get_or_create(
                        name=step['type'])
                    if created:
                        self.logger.info('Created purification step type {}'.format(
                            pust.purification_type))
                    pust.save()
                else:
                    self.logger.error('Purification step incorrectly defined for {}'.format(p))

        # create crystallization records
        if 'crystallization' in sd and sd['crystallization']:
            cy = Crystallization()
            cy.construct = pc
            cyt = CrystallizationMethodTypes.objects.create()
            cy.crystal_type = cyt
            cy.method = sd['crystallization']['method']
            cy.settings = sd['crystallization']['settings']
            cy.protein_conc = sd['crystallization']['protein_conc']
            cl = ChemicalList.objects.create()
            cy.chemical_list = cl

            for step in sd['crystallization']['chemicallist']:
                if 'type' in step and 'item' in step and 'concentration' in step:
                    chem = Chemical()
                    chem.chemical_type, created = ChemicalType.objects.get_or_create(name=step['type'])
                    chem.name = step['item']
                    chem.save()

                    cc = ChemicalConc()
                    cc.concentration = step['concentration']
                    cc.chemical = chem  # since ChemicalConc has a ForeignKey to Chemical
                    cc.save()
                    cl.chemicals.add(cc)
                else:
                    self.logger.error('Crystallization step incorrectly defined for {}'.format(p))

            cy.aqueous_solution_lipid_ratio = sd['crystallization']['aqueous_solution_lipid_ratio_LCP']
            cy.lcp_bolus_volume = sd['crystallization']['LCP_bolus_volume']
            cy.precipitant_solution_volume = sd['crystallization']['precipitant_solution_volume']
            cy.temp = sd['crystallization']['temperature']
            cy.ph = sd['crystallization']['ph']
            if 'remarks' in sd['crystallization']:
                cy.remarks = sd['crystallization']['remarks']
            cy.save()

        # create residues
        prs = Residue.objects.filter(protein_conformation=ppc).prefetch_related(
            'protein_conformation__protein', 'protein_segment', 'generic_number',
            'display_generic_number__scheme', 'alternative_generic_numbers__scheme')
        updated_sequence = ''
        for pr in prs:
            if pr.sequence_number in deletions:
                continue

            r = Residue()
            r.protein_conformation = pc
            r.generic_number = pr.generic_number
            r.display_generic_number = pr.display_generic_number
            r.sequence_number = pr.sequence_number

            # check for split segments
            if pr.protein_segment.slug in split_segments:
                split = split_segments[pr.protein_segment.slug]
                rsns = split['start']['sequence_number']
                rsne = split['end']['sequence_number']
                if r.sequence_number <= rsns:
                    r.protein_segment = split['start']['segment']
                elif r.sequence_number >= rsne:
                    r.protein_segment = split['end']['segment']
            else:
                r.protein_segment = pr.protein_segment

            # amino acid, check for mutations
            if r.sequence_number in mutations:
                if mutations[r.sequence_number]['wt_res'] == pr.amino_acid:
                    r.amino_acid = mutations[r.sequence_number]['mut_res']
                else:
                    # NOTE(review): the amino acid is left unset for this residue - confirm
                    # whether falling back to the wild-type residue would be preferable
                    self.logger.error(
                        'Mutation {} in construct {} does not match wild-type sequence of {}'.format(
                            mutations[r.sequence_number]['full'], pc.protein.name,
                            ppc.protein.entry_name))
            else:
                r.amino_acid = pr.amino_acid

            # save amino acid to updated sequence
            updated_sequence += r.amino_acid

            # save residue before populating M2M relations
            r.save()

            # alternative generic numbers
            for agn in pr.alternative_generic_numbers.all():
                r.alternative_generic_numbers.add(agn)

        # update sequence
        p.sequence = updated_sequence
        p.save()
def create_rotamers(self, structure, pdb_path):
    """Create Residue and Rotamer records for *structure* from its PDB ATOM lines.

    The preferred chain of the PDB file at *pdb_path* is turned into a
    sequence and locally aligned against the parent (wild-type) protein
    sequence. The ATOM records stored in ``structure.pdb_data.pdb`` are then
    grouped per residue; every residue with a sequence number below 2000 that
    is present in the parsed chain is saved, copying generic numbers and the
    protein segment from the wild-type residue it was mapped to, together
    with a Rotamer holding the residue's ATOM lines. Always returns ``None``.
    """
    wt_lookup = {}  # used to match WT seq_number to WT residue record
    pdbseq = {}  # used to keep track of pdbseq residue positions vs index in seq
    ref_positions = {}  # WT positions in alignment
    mapped_seq = {}  # index in construct, list of AA and WT [position, AA]

    preferred_chain = structure.preferred_chain
    if len(preferred_chain.split(',')) > 1:  # if A,B
        preferred_chain = preferred_chain.split(',')[0]

    AA = {'ALA':'A', 'ARG':'R', 'ASN':'N', 'ASP':'D',
        'CYS':'C', 'GLN':'Q', 'GLU':'E', 'GLY':'G',
        'HIS':'H', 'ILE':'I', 'LEU':'L', 'LYS':'K',
        'MET':'M', 'PHE':'F', 'PRO':'P', 'SER':'S',
        'THR':'T', 'TRP':'W', 'TYR':'Y', 'VAL':'V'}

    s = PDBParser(PERMISSIVE=True, QUIET=True).get_structure('ref', pdb_path)[0]
    chain = s[preferred_chain]  # select only one chain (avoid n-mer receptors)
    ppb = PPBuilder()
    seq = ''
    i = 1

    # remove >1000 pos (fusion protein / gprotein)
    check_1000 = 0
    for pp in ppb.build_peptides(chain):
        for res in pp:
            res_id = res.id
            if res_id[1] < 600:
                check_1000 += 1
            # need check_1000 to catch structures where they lie in 1000s
            # (4LDE, 4LDL, 4LDO, 4N4W, 4QKX)
            if res_id[1] > 1000 and check_1000 > 200:
                chain.detach_child(res_id)

    for pp in ppb.build_peptides(chain):
        seq += str(pp.get_sequence())  # get seq from fasta (only chain A)
        for residue in pp:
            residue_id = residue.get_full_id()
            chain = residue_id[2]
            if chain not in pdbseq:
                pdbseq[chain] = {}
            pos = residue_id[3][1]
            pdbseq[chain][pos] = [i, AA[residue.resname]]
            i += 1

    parent_seq = str(structure.protein_conformation.protein.parent.sequence)
    rs = Residue.objects.filter(
        protein_conformation__protein=structure.protein_conformation.protein.parent
    ).prefetch_related('display_generic_number', 'generic_number', 'protein_segment')

    for r in rs:  # required to match WT position to a record (for duplication of GN values)
        wt_lookup[r.sequence_number] = r

    # align WT with structure seq -- make gaps penalties big, so to avoid too much overfitting
    pw2 = pairwise2.align.localms(parent_seq, seq, 2, -4, -4, -.1)

    gaps = 0
    unmapped_ref = {}
    for i, r in enumerate(pw2[0][0], 1):  # loop over alignment to create lookups (track pos)
        if r == "-":
            gaps += 1
            ref_positions[i] = [None, '-']
        else:
            ref_positions[i] = [i - gaps, r]

        if pw2[0][1][i-1] == '-':
            unmapped_ref[i - gaps] = '-'

    gaps = 0
    for i, r in enumerate(pw2[0][1], 1):  # make second lookup
        if r == "-":
            gaps += 1
        else:
            mapped_seq[i - gaps] = [r, ref_positions[i]]

    pdb = structure.pdb_data.pdb
    protein_conformation = structure.protein_conformation
    temp = ''  # ATOM lines collected for the residue currently being read
    check = 0  # residue number expected on the next line; 0 until the first atom is seen
    # mapping statistics, only of interest when debugging
    mismatch_seq = 0
    match_seq = 0
    not_matched = 0
    matched_by_pos = 0
    aa_mismatch = 0

    # get rid of all odd records
    pdblines = [line for line in pdb.splitlines() if line.startswith('ATOM')]
    pdblines.append('')  # add a line to not "run out"

    for i, line in enumerate(pdblines):
        if not line.startswith('ATOM'):
            continue
        chain = line[21]
        # if preferred is defined and is not the same as the current line, then skip
        if preferred_chain and chain != preferred_chain:
            continue

        nextline = pdblines[i+1]
        residue_number = line[22:26].strip()
        temp += line + "\n"

        # the next line continues this rotamer when it is an ATOM record of the same
        # residue (or when no residue has been seen yet)
        same_residue = ((check == 0 or nextline[22:26].strip() == check)
            and nextline.startswith('ATOM'))

        if not same_residue:
            # this line is the last atom of the residue: number and name are read from
            # it directly, so single-atom residues and the very first residue are
            # handled correctly
            if int(residue_number) < 2000:
                residue = Residue()
                residue.sequence_number = int(residue_number)
                residue.amino_acid = AA[line[17:20].upper()]
                residue.protein_conformation = protein_conformation

                try:
                    seq_num_pos = pdbseq[chain][residue.sequence_number][0]
                except KeyError:
                    # residue is not part of the parsed chain sequence: drop it
                    temp = ""  # start new line for rotamer
                    check = nextline[22:26].strip()
                    continue

                if seq_num_pos in mapped_seq:
                    wt_pos = mapped_seq[seq_num_pos][1][0]
                    if wt_pos is None:
                        # no match found
                        residue.display_generic_number = None
                        residue.generic_number = None
                        residue.protein_segment = None
                        not_matched += 1
                    else:
                        wt_r = wt_lookup[wt_pos]
                        if (residue.sequence_number != wt_r.sequence_number
                                and residue.amino_acid != wt_r.amino_acid
                                and residue.sequence_number in wt_lookup):
                            # if pos numbers not work -- see if the pos number might be in
                            # WT and unmapped
                            if wt_lookup[residue.sequence_number].amino_acid == residue.amino_acid:
                                if residue.sequence_number in unmapped_ref:
                                    # WT was not mapped, so could be it:
                                    # wrongly matched, better match on pos+aa
                                    wt_r = wt_lookup[residue.sequence_number]
                                    matched_by_pos += 1
                                    match_seq += 1
                                else:
                                    # could have been matched, but already aligned to
                                    # another position
                                    mismatch_seq += 1
                            else:
                                # WT pos not same AA, mismatch
                                mismatch_seq += 1
                        elif residue.sequence_number != wt_r.sequence_number:
                            # WT pos not same pos, mismatch
                            mismatch_seq += 1
                        elif residue.amino_acid != wt_r.amino_acid:
                            aa_mismatch += 1
                        else:
                            match_seq += 1

                        if wt_r.generic_number is not None:
                            residue.display_generic_number = wt_r.display_generic_number
                            residue.generic_number = wt_r.generic_number
                        else:
                            residue.display_generic_number = None
                            residue.generic_number = None
                        residue.protein_segment = wt_r.protein_segment
                else:
                    # position is missing from the alignment lookup
                    residue.display_generic_number = None
                    residue.generic_number = None
                    residue.protein_segment = None

                residue.save()

                rotamer_data, created = PdbData.objects.get_or_create(pdb=temp)
                Rotamer.objects.get_or_create(residue=residue, structure=structure,
                    pdbdata=rotamer_data)

            temp = ""  # start new line for rotamer

        check = nextline[22:26].strip()

    return None
def handle(self, *args, **options): startTime = datetime.datetime.now() self.options = options if self.options["purge"]: Residue.objects.filter( protein_conformation__protein__entry_name__endswith="_a", protein_conformation__protein__family__parent__parent__name= "Alpha").delete() ProteinConformation.objects.filter( protein__entry_name__endswith="_a", protein__family__parent__parent__name="Alpha").delete() Protein.objects.filter( entry_name__endswith="_a", family__parent__parent__name="Alpha").delete() SignprotStructureExtraProteins.objects.all().delete() SignprotStructure.objects.all().delete() if not options["only_signprot_structures"]: # Building protein and protconf objects for g protein structure in complex if options["s"]: scs = SignprotComplex.objects.filter( structure__pdb_code__index__in=[ i.upper() for i in options["s"] ]) else: scs = SignprotComplex.objects.all() for sc in scs: self.logger.info( "Protein, ProteinConformation and Residue build for alpha subunit of {} is building" .format(sc)) try: # Alpha subunit try: alpha_protein = Protein.objects.get( entry_name=sc.structure.pdb_code.index.lower() + "_a") except: alpha_protein = Protein() alpha_protein.entry_name = sc.structure.pdb_code.index.lower( ) + "_a" alpha_protein.accession = None alpha_protein.name = sc.structure.pdb_code.index.lower( ) + "_a" alpha_protein.sequence = sc.protein.sequence alpha_protein.family = sc.protein.family alpha_protein.parent = sc.protein alpha_protein.residue_numbering_scheme = sc.protein.residue_numbering_scheme alpha_protein.sequence_type = ProteinSequenceType.objects.get( slug="mod") alpha_protein.source = ProteinSource.objects.get( name="OTHER") alpha_protein.species = sc.protein.species alpha_protein.save() try: alpha_protconf = ProteinConformation.objects.get( protein__entry_name=sc.structure.pdb_code.index. 
lower() + "_a") except: alpha_protconf = ProteinConformation() alpha_protconf.protein = alpha_protein alpha_protconf.state = ProteinState.objects.get( slug="active") alpha_protconf.save() pdbp = PDBParser(PERMISSIVE=True, QUIET=True) s = pdbp.get_structure("struct", StringIO(sc.structure.pdb_data.pdb)) chain = s[0][sc.alpha] nums = [] for res in chain: if "CA" in res and res.id[0] == " ": nums.append(res.get_id()[1]) resis = Residue.objects.filter( protein_conformation__protein=sc.protein) num_i = 0 temp_seq2 = "" pdb_num_dict = OrderedDict() # Create first alignment based on sequence numbers for n in nums: if sc.structure.pdb_code.index == "6OIJ" and n < 30: nr = n + 6 else: nr = n pdb_num_dict[n] = [ chain[n], resis.get(sequence_number=nr) ] # Find mismatches mismatches = [] for n, res in pdb_num_dict.items(): if AA[res[0].get_resname()] != res[1].amino_acid: mismatches.append(res) pdb_lines = sc.structure.pdb_data.pdb.split("\n") seqadv = [] for l in pdb_lines: if l.startswith("SEQADV"): seqadv.append(l) mutations, shifted_mutations = OrderedDict(), OrderedDict() # Search for annotated engineered mutations in pdb SEQADV for s in seqadv: line_search = re.search( "SEQADV\s{1}[A-Z\s\d]{4}\s{1}([A-Z]{3})\s{1}([A-Z]{1})\s+(\d+)[\s\S\d]{5}([\s\S\d]{12})([A-Z]{3})\s+(\d+)(\s\S+)", s) if line_search != None: if line_search.group(2) == sc.alpha: if line_search.group( 4).strip() == sc.protein.accession: if line_search.group( 3) == line_search.group(6): mutations[int( line_search.group(3))] = [ line_search.group(1), line_search.group(5) ] else: shifted_mutations[int( line_search.group(3))] = [ line_search.group(1), line_search.group(5), int(line_search.group(6)) ] else: # Exception for 6G79 if line_search.group( 3 ) != line_search.group( 6 ) and "CONFLICT" in line_search.group(7): mutations[int( line_search.group(3))] = [ line_search.group(1), line_search.group(5) ] # Exception for 5G53 if line_search.group( 4).strip() != sc.protein.accession: mutations[int( 
line_search.group(3))] = [ line_search.group(1), line_search.group(5) ] remaining_mismatches = [] # Check and clear mismatches that are registered in pdb SEQADV as engineered mutation for m in mismatches: num = m[0].get_id()[1] if num in mutations: if m[0].get_resname() != mutations[num][0] and m[ 1].amino_acid != AA[mutations[num][1]]: remaining_mismatches.append(m) elif num in shifted_mutations: remaining_mismatches.append(m) else: remaining_mismatches.append(m) if options["debug"]: print(sc) print(mutations) print(shifted_mutations) print(mismatches) print("======") print(remaining_mismatches) pprint.pprint(pdb_num_dict) no_seqnum_shift = [ '6OY9', '6OYA', '6LPB', '6WHA', '7D77', '6XOX', '7L1U', '7L1V' ] # Check if HN is mutated to GNAI1 for the scFv16 stabilizer if sc.protein.entry_name != 'gnai1_human' and len( remaining_mismatches) > 0: target_HN = resis.filter(protein_segment__slug='HN') gnai1_HN = Residue.objects.filter( protein_conformation__protein__entry_name= 'gnai1_human', protein_segment__slug='HN') pdb_HN_seq = '' for num, val in pdb_num_dict.items(): if num <= target_HN.reverse()[0].sequence_number: pdb_HN_seq += Polypeptide.three_to_one( val[0].get_resname()) if options['debug']: print('Checking if HN is gnai1_human') print(pdb_HN_seq) print(''.join( gnai1_HN.values_list('amino_acid', flat=True))) gnai1_HN_seq = ''.join( gnai1_HN.values_list('amino_acid', flat=True)) pw2 = pairwise2.align.localms(gnai1_HN_seq, pdb_HN_seq, 3, -4, -3, -1) ref_seq, temp_seq = str(pw2[0][0]), str(pw2[0][1]) length, match = 0, 0 for r, t in zip(ref_seq, temp_seq): if options['debug']: print(r, t) if t != '-': if r == t: match += 1 length += 1 identity = match / length * 100 if options['debug']: print(identity) if identity > 85: if sc.structure.pdb_code.index not in ['7DFL']: no_seqnum_shift.append( sc.structure.pdb_code.index) if options['debug']: print( 'INFO: HN has {}% with gnai1_human HN, skipping seqnum shift correction' .format(round(identity))) # Mismatches 
remained possibly to seqnumber shift, making pairwise alignment to try and fix alignment if len( remaining_mismatches ) > 0 and sc.structure.pdb_code.index not in no_seqnum_shift: ppb = PPBuilder() seq = "" for pp in ppb.build_peptides(chain, aa_only=False): seq += str(pp.get_sequence()) if sc.structure.pdb_code.index in [ '7JVQ', '7L1U', '7L1V' ]: pw2 = pairwise2.align.localms( sc.protein.sequence, seq, 3, -4, -3, -1) else: pw2 = pairwise2.align.localms( sc.protein.sequence, seq, 2, -1, -.5, -.1) ref_seq, temp_seq = str(pw2[0][0]), str(pw2[0][1]) # Custom fix for A->G mutation at pos 18 if sc.structure.pdb_code.index == '7JJO': ref_seq = ref_seq[:18] + ref_seq[19:] temp_seq = temp_seq[:17] + temp_seq[18:] # Custom alignment fixes elif sc.structure.pdb_code.index == '7DFL': ref_seq = 'MTLESIMACCLSEEAKEARRINDEIERQLRRDKRDARRELKLLLLGTGESGKSTFIKQMRIIHGSGYSDEDKRGFTKLVYQNIFTAMQAMIRAMDTLKIPYKYEHNKAHAQLVREVDVEKVSAFENPYVDAIKSLWNDPGIQECYDRRREYQLSDSTKYYLNDLDRVADPAYLPTQQDVLRVRVPTTGIIEYPFDLQSVIFRMVDVGGQRSERRKWIHCFENVTSIMFLVALSEYDQVLVESDNENRMEESKALFRTIITYPWFQNSSVILFLNKKDLLEEKIMYSHLVDYFPEYDGPQRDAQAAREFILKMFVDLNPDSDKIIYSHFTCATDTENIRFVFAAVKDTILQLNLKEYNLV' temp_seq = '--------CTLSAEDKAAVERSKMIDRNLREDGEKARRELKLLLLGTGESGKSTFIKQMRIIHG--------------------------------------------------------------------------------------------------------------------------TGIIEYPFDLQSVIFRMVDVGGQRSERRKWIHCFENVTSIMFLVALSEYDQV----DNENRMEESKALFRTIITYPWFQNSSVILFLNKKDLLEEKIMYSHLVDYFPEYDGPQRDAQAAREFILKMFVDLNPDSDKILYSHFTCATDTENIRFVFAAVKDTILQLNLKEYNLV' elif sc.structure.pdb_code.index == '7JOZ': temp_seq = temp_seq[:67] + ( '-' * 14) + 'FNGDS' + temp_seq[86:] elif sc.structure.pdb_code.index == '7AUE': ref_seq = ref_seq[:31].replace('-', '') + ref_seq[31:] temp_seq = ( 9 * '-') + temp_seq[2:5] + temp_seq[5:54].replace( '-', '') + temp_seq[54:] wt_pdb_dict = OrderedDict() pdb_wt_dict = OrderedDict() j, k = 0, 0 for i, ref, temp in zip(range(0, len(ref_seq)), ref_seq, temp_seq): if options["debug"]: print(i, ref, 
temp) # alignment check if ref != "-" and temp != "-": wt_pdb_dict[resis[j]] = pdb_num_dict[nums[k]] pdb_wt_dict[pdb_num_dict[nums[k]] [0]] = resis[j] j += 1 k += 1 elif ref == "-": wt_pdb_dict[i] = pdb_num_dict[nums[k]] pdb_wt_dict[pdb_num_dict[nums[k]][0]] = i k += 1 elif temp == "-": wt_pdb_dict[resis[j]] = i pdb_wt_dict[i] = resis[j] j += 1 # Custom fix for 7JJO isoform difference if sc.structure.pdb_code.index in [ '7JJO', '7JOZ', '7AUE' ]: pdb_num_dict = OrderedDict() for wt_res, st_res in wt_pdb_dict.items(): if type(st_res) == type([]): pdb_num_dict[wt_res.sequence_number] = [ st_res[0], wt_res ] else: for i, r in enumerate(remaining_mismatches): # Adjust for shifted residue when residue is a match if r[0].get_id()[1] - remaining_mismatches[ i - 1][0].get_id()[1] > 1: pdb_num_dict[r[0].get_id()[1] - 1][1] = pdb_wt_dict[chain[ r[0].get_id()[1] - 1]] # Adjust for shifted residue when residue is mutated and it's logged in SEQADV if r[0].get_id()[1] in shifted_mutations: pdb_num_dict[ r[0].get_id()[1]][1] = resis.get( sequence_number=shifted_mutations[ r[0].get_id()[1]][2]) # Adjust for shift else: pdb_num_dict[r[0].get_id() [1]][1] = pdb_wt_dict[r[0]] if sc.structure.pdb_code.index == '7JVQ': pdb_num_dict[198][1] = Residue.objects.get( protein_conformation__protein=sc.protein, sequence_number=346) pdb_num_dict[235][1] = Residue.objects.get( protein_conformation__protein=sc.protein, sequence_number=383) elif sc.structure.pdb_code.index == '6PB0': pdb_num_dict[205][1] = Residue.objects.get( protein_conformation__protein=sc.protein, sequence_number=205) ### Custom alignment fix for 6WHA mini-Gq/Gi2/Gs chimera elif sc.structure.pdb_code.index == "6WHA": ref_seq = 
"MTLESIMACCLSEEAKEARRINDEIERQLRRDKRDARRELKLLLLGTGESGKSTFIKQMRIIHGSGYSDEDKRGFTKLVYQNIFTAMQAMIRAMDTLKIPYKYEHNKAHAQLVREVDVEKVSAFENPYVDAIKSLWNDPGIQECYDRRREYQLSDSTKYYLNDLDRVADPAYLPTQQDVLRVRVPTTGIIEYPFDLQSVIFRMVDVGGQRSERRKWIHCFENVTSIMFLVALSEYDQVLVESDNENRMEESKALFRTIITYPWFQNSSVILFLNKKDLLEEKIM--YSHLVDYFPEYDGP----QRDAQAAREFILKMFVDL---NPDSDKIIYSHFTCATDTENIRFVFAAVKDTILQLNLKEYNLV" temp_seq = "----------VSAEDKAAAERSKMIDKNLREDGEKARRTLRLLLLGADNSGKSTIVK----------------------------------------------------------------------------------------------------------------------------------GIFETKFQVDKVNFHMFDVG-----RRKWIQCFNDVTAIIFVVDSSDYNR----------LQEALNDFKSIWNNRWLRTISVILFLNKQDLLAEKVLAGKSKIEDYFPEFARYTTPDPRVTRAKY-FIRKEFVDISTASGDGRHICYPHFTC-VDTENARRIFNDCKDIILQMNLREYNLV" pdb_num_dict = OrderedDict() temp_resis = [res for res in chain] temp_i = 0 mapped_cgns = [] for i, aa in enumerate(temp_seq): if aa != "-": ref_split_on_gaps = ref_seq[:i + 1].split("-") ref_seqnum = i - (len(ref_split_on_gaps) - 1) + 1 res = resis.get(sequence_number=ref_seqnum) if res.display_generic_number.label in mapped_cgns: next_presumed_cgn = self.get_next_presumed_cgn( res) if next_presumed_cgn: res = next_presumed_cgn while res and res.display_generic_number.label in mapped_cgns: res = self.get_next_presumed_cgn( res) else: print( "Error: {} CGN does not exist. 
Incorrect mapping of {} in {}" .format(next_presumed_cgn, chain[nums[temp_i]], sc.structure)) mapped_cgns.append( res.display_generic_number.label) pdb_num_dict[nums[temp_i]] = [ chain[nums[temp_i]], res ] temp_i += 1 bulked_rotamers = [] for key, val in pdb_num_dict.items(): # print(key, val) # sanity check if not isinstance(val[1], int): res_obj = Residue() res_obj.sequence_number = val[0].get_id()[1] res_obj.amino_acid = AA[val[0].get_resname()] res_obj.display_generic_number = val[ 1].display_generic_number res_obj.generic_number = val[1].generic_number res_obj.protein_conformation = alpha_protconf res_obj.protein_segment = val[1].protein_segment res_obj.save() rot = self.create_structure_rotamer( val[0], res_obj, sc.structure) bulked_rotamers.append(rot) else: self.logger.info( "Skipped {} as no annotation was present, while building for alpha subunit of {}" .format(val[1], sc)) if options["debug"]: pprint.pprint(pdb_num_dict) Rotamer.objects.bulk_create(bulked_rotamers) self.logger.info( "Protein, ProteinConformation and Residue build for alpha subunit of {} is finished" .format(sc)) except Exception as msg: if options["debug"]: print("Error: ", sc, msg) self.logger.info( "Protein, ProteinConformation and Residue build for alpha subunit of {} has failed" .format(sc)) if not options["s"]: ### Build SignprotStructure objects from non-complex signprots g_prot_alphas = Protein.objects.filter( family__slug__startswith="100_001", accession__isnull=False) #.filter(entry_name="gnai1_human") complex_structures = SignprotComplex.objects.all().values_list( "structure__pdb_code__index", flat=True) for a in g_prot_alphas: pdb_list = get_pdb_ids(a.accession) for pdb in pdb_list: if pdb not in complex_structures: try: data = self.fetch_gprot_data(pdb, a) if data: self.build_g_prot_struct(a, pdb, data) except Exception as msg: self.logger.error( "SignprotStructure of {} {} failed\n{}: {}". 
format(a.entry_name, pdb, type(msg), msg)) if options["debug"]: print(datetime.datetime.now() - startTime)
def add_cgn_residues(self, gprotein_list):
    """Create Residue rows with CGN generic numbers for the given G proteins.

    Reads the tab-separated table at ``self.gprotein_data_file``, keeps the
    rows whose ``Uniprot_ACC`` is in ``gprotein_list`` and builds one
    ``Residue`` per row from its ``Position``, ``Residue`` and ``CGN``
    columns.  Residues are written with ``bulk_create`` in batches of 10000.
    Every generic number label is also registered once in
    ``ResidueGenericNumberEquivalent`` under the ``cgn`` numbering scheme,
    which is needed for single residue selection.

    Args:
        gprotein_list: UniProt accessions to import (passed to
            ``Series.isin``).
    """
    batch_size = 10000

    # Parsing pdb uniprot file for residues
    self.logger.info('Start parsing PDB_UNIPROT_ENSEMBLE_ALL')
    self.logger.info('Parsing file ' + self.gprotein_data_file)
    residue_data = pd.read_table(self.gprotein_data_file, sep="\t", low_memory=False)
    residue_data = residue_data.loc[residue_data['Uniprot_ACC'].isin(gprotein_list)]
    cgn_scheme = ResidueNumberingScheme.objects.get(slug='cgn')

    # In-memory caches so every database object is fetched only once
    proteins = {}
    generic_numbers = {}
    segments = {}
    equivalents = set()

    bulk = []
    # Stays None when no row matches, so the final log line cannot fail on
    # an unbound loop variable.
    index = None
    self.logger.info('Insert residues: {} rows'.format(len(residue_data)))
    for index, row in residue_data.iterrows():
        accession = row['Uniprot_ACC']
        if accession in proteins:
            pr, pc = proteins[accession]
        else:
            # fetch protein for protein conformation
            pr, _ = Protein.objects.get_or_create(accession=accession)
            # fetch protein conformation
            # NOTE(review): the Protein instance is passed as protein_id,
            # kept exactly as in the original call.
            pc, _ = ProteinConformation.objects.get_or_create(protein_id=pr)
            proteins[accession] = [pr, pc]

        # fetch residue generic number; a third CGN component below 10 is
        # prefixed with a zero to form the label
        cgn_parts = row['CGN'].split('.')
        if int(cgn_parts[2]) < 10:
            label = cgn_parts[0] + '.' + cgn_parts[1] + '.0' + cgn_parts[2]
        else:
            label = row['CGN']
        if label in generic_numbers:
            rgn = generic_numbers[label]
        else:
            rgn, _ = ResidueGenericNumber.objects.get_or_create(label=label)
            generic_numbers[label] = rgn

        # fetch protein segment; its slug is the second CGN component
        segment_slug = cgn_parts[1]
        if segment_slug in segments:
            ps = segments[segment_slug]
        else:
            ps, _ = ProteinSegment.objects.get_or_create(
                slug=segment_slug, proteinfamily='Gprotein')
            segments[segment_slug] = ps

        try:
            bulk.append(Residue(
                sequence_number=row['Position'],
                protein_conformation=pc,
                amino_acid=row['Residue'],
                generic_number=rgn,
                display_generic_number=rgn,
                protein_segment=ps))
        except Exception:
            self.logger.error("Failed to add residues")

        # Flush only full batches; an empty batch is never written
        if len(bulk) >= batch_size:
            self.logger.info('Inserted bulk {} (Index:{})'.format(len(bulk), index))
            Residue.objects.bulk_create(bulk)
            bulk = []

        # Add also to the ResidueGenericNumberEquivalent table needed for
        # single residue selection
        try:
            if rgn.label not in equivalents:
                ResidueGenericNumberEquivalent.objects.get_or_create(
                    label=rgn.label, default_generic_number=rgn, scheme=cgn_scheme)
                equivalents.add(rgn.label)
        except Exception:
            self.logger.error("Failed to add residues to ResidueGenericNumberEquivalent")

    # Write whatever is left over from the last, partial batch
    self.logger.info('Inserted bulk {} (Index:{})'.format(len(bulk), index))
    Residue.objects.bulk_create(bulk)
def main_func(self, positions, iteration):
    """Build construct proteins from the YAML files in ``self.construct_data_dir``.

    For every file in the ``positions`` slice of ``self.filenames`` a new
    ``Protein`` (the construct) and ``ProteinConformation`` are created from
    the parent protein named in the file.  The parent's residues are then
    copied to the construct with the file's deletions, mutations and
    insertions (fusion proteins) applied, and the construct sequence is
    rebuilt from the copied residues.

    Args:
        positions: ``(start, end)`` indices into ``self.filenames``; a falsy
            ``end`` means "up to the end of the list".
        iteration: Not used by this method.
    """
    # filenames
    if not positions[1]:
        filenames = self.filenames[positions[0]:]
    else:
        filenames = self.filenames[positions[0]:positions[1]]

    # parse files
    for source_file in filenames:
        source_file_path = os.sep.join([self.construct_data_dir, source_file])
        # only regular, non-hidden files are construct definitions
        if not os.path.isfile(source_file_path) or source_file[0] == '.':
            continue

        self.logger.info('Reading file {}'.format(source_file_path))
        # read the yaml file
        # NOTE: yaml.load() without a Loader can build arbitrary Python
        # objects on old PyYAML and raises TypeError on PyYAML >= 6.  Only
        # plain scalars, lists and mappings are read from the document
        # below, so safe_load is used instead.
        with open(source_file_path, 'r') as f:
            sd = yaml.safe_load(f)

        # is a protein specified?
        if 'protein' not in sd:
            self.logger.error('Protein not specified for construct, skipping')
            continue

        # fetch the parent protein
        try:
            ppc = ProteinConformation.objects.prefetch_related(
                'protein__family', 'protein__species',
                'protein__residue_numbering_scheme').get(
                    protein__entry_name=sd['protein'],
                    state__slug=settings.DEFAULT_PROTEIN_STATE)
        except ProteinConformation.DoesNotExist:
            # abort if parent protein is not found
            self.logger.error('Parent protein {} for construct {} not found, aborting!'.format(
                sd['protein'], sd['name']))
            continue

        # sequence type
        try:
            sequence_type, created = ProteinSequenceType.objects.get_or_create(
                slug='mod', defaults={'name': 'Modified'})
            if created:
                self.logger.info('Created sequence type {}'.format(sequence_type))
        except IntegrityError:
            sequence_type = ProteinSequenceType.objects.get(slug='mod')

        # protein source
        try:
            protein_source, created = ProteinSource.objects.get_or_create(name='OTHER')
            if created:
                self.logger.info('Created protein source {}'.format(protein_source))
        except IntegrityError:
            protein_source = ProteinSource.objects.get(name='OTHER')

        # create a protein record
        p = Protein()
        p.parent = ppc.protein
        p.family = ppc.protein.family
        p.species = ppc.protein.species
        p.residue_numbering_scheme = ppc.protein.residue_numbering_scheme
        p.sequence_type = sequence_type
        p.source = protein_source
        p.entry_name = slugify(strip_tags(sd['name']))
        p.name = sd['name']
        p.sequence = ppc.protein.sequence

        # save protein (construct)
        try:
            p.save()
            self.logger.info('Created construct {} with parent protein {}'.format(
                p.name, ppc.protein.entry_name))
        except Exception:
            self.logger.error('Failed creating construct {} with parent protein {}'.format(
                p.name, ppc.protein.entry_name))
            continue

        # create protein conformation record
        pc = ProteinConformation()
        pc.protein = p
        pc.state = ProteinState.objects.get(slug=settings.DEFAULT_PROTEIN_STATE)
        try:
            pc.save()
            self.logger.info('Created conformation {} of protein {}'.format(
                pc.state.name, p.name))
        except Exception:
            self.logger.error('Failed creating conformation {} of protein {}'.format(
                pc.state.name, p.entry_name))
            # residues cannot be attached to an unsaved conformation, so
            # skip this construct like a failed protein save
            continue

        # create residue records
        # deleted positions are kept in a set for fast membership tests
        deletions = set()
        if 'deletions' in sd and sd['deletions']:
            for t in sd['deletions']:
                deletions.update(range(t[0], t[1] + 1))

        # mutations are strings such as <wt residue><position><mutant residue>
        mutations = {}
        if 'mutations' in sd and sd['mutations']:
            for m in sd['mutations']:
                res_num = int(m[1:-1])
                mutations[res_num] = {
                    'wt_res': m[0],
                    'mut_res': m[-1],
                    'full': m,
                }

        # insertions
        split_segments = {}
        if 'insertions' in sd and sd['insertions']:
            for ins in sd['insertions']:
                ins_first, ins_last = ins['positions'][0], ins['positions'][1]
                ins_start = Residue.objects.get(protein_conformation=ppc,
                                                sequence_number=ins_first)
                ins_end = Residue.objects.get(protein_conformation=ppc,
                                              sequence_number=ins_last)

                # if the insertion is within only one segment (the usual case), split that
                # segment into two segments
                if ins_start and ins_start.protein_segment == ins_end.protein_segment:
                    parent_segment = ins_start.protein_segment
                    # get/create split protein segments
                    halves = []
                    for suffix in ('_1', '_2'):
                        slug = parent_segment.slug + suffix
                        try:
                            segment, created = ProteinSegment.objects.get_or_create(
                                slug=slug,
                                defaults={'name': parent_segment.name,
                                          'category': parent_segment.category,
                                          'partial': True})
                            if created:
                                self.logger.info('Created protein segment {}'.format(segment))
                        except IntegrityError:
                            segment = ProteinSegment.objects.get(slug=slug)
                        halves.append(segment)
                    segment_before, segment_after = halves

                    # keep track of information about split segments
                    split_segments[parent_segment.slug] = {
                        'start': {
                            'sequence_number': ins_first,
                            'segment': segment_before,
                        },
                        'end': {
                            'sequence_number': ins_last,
                            'segment': segment_after,
                        },
                    }
                # if the insertion covers two segments, use those two as the segments before and after
                elif ins_start:
                    segment_before = ins_start.protein_segment
                    segment_after = ins_end.protein_segment

                # if the insertion replaces a part of the sequence, add that range as a deletion
                if ins_last > (ins_first + 1):
                    deletions.update(range(ins_first + 1, ins_last))

                # get/insert fusion protein
                fusion, _ = ProteinFusion.objects.get_or_create(
                    name=ins['name'], defaults={'sequence': ins['sequence']})

                # create relationship with protein
                ProteinFusionProtein.objects.create(protein=p, protein_fusion=fusion,
                                                    segment_before=segment_before,
                                                    segment_after=segment_after)

        prs = Residue.objects.filter(protein_conformation=ppc).prefetch_related(
            'protein_conformation__protein', 'protein_segment', 'generic_number',
            'display_generic_number__scheme', 'alternative_generic_numbers__scheme')
        updated_sequence = ''
        for pr in prs:
            if pr.sequence_number in deletions:
                continue

            r = Residue()
            r.protein_conformation = pc
            r.generic_number = pr.generic_number
            r.display_generic_number = pr.display_generic_number
            r.sequence_number = pr.sequence_number

            # check for split segments
            if pr.protein_segment.slug in split_segments:
                split = split_segments[pr.protein_segment.slug]
                if r.sequence_number <= split['start']['sequence_number']:
                    r.protein_segment = split['start']['segment']
                elif r.sequence_number >= split['end']['sequence_number']:
                    r.protein_segment = split['end']['segment']
            else:
                r.protein_segment = pr.protein_segment

            # amino acid, check for mutations
            if r.sequence_number in mutations:
                mutation = mutations[r.sequence_number]
                if mutation['wt_res'] == pr.amino_acid:
                    r.amino_acid = mutation['mut_res']
                else:
                    # format the complete message (previously only the
                    # trailing ' of {}' fragment was formatted)
                    self.logger.error(
                        'Mutation {} in construct {} does not match wild-type sequence of {}'.format(
                            mutation['full'], pc.protein.name, ppc.protein.entry_name))
                    # keep the wild-type residue so the residue and the
                    # rebuilt sequence still carry an amino acid
                    r.amino_acid = pr.amino_acid
            else:
                r.amino_acid = pr.amino_acid

            # save amino acid to updated sequence
            updated_sequence += r.amino_acid

            # save residue before populating M2M relations
            r.save()

            # alternative generic numbers
            for agn in pr.alternative_generic_numbers.all():
                r.alternative_generic_numbers.add(agn)

        # update sequence
        p.sequence = updated_sequence
        p.save()
def main_func(self, positions, iteration):
    """Create construct proteins from YAML definitions.

    Each file in the ``positions`` slice of ``self.filenames`` (located in
    ``self.construct_data_dir``) names a parent protein.  A new ``Protein``
    and ``ProteinConformation`` are created for the construct, then the
    parent's residues are copied over while applying the deletions,
    mutations and insertions (fusion proteins) listed in the file.  The
    construct sequence is rebuilt from the copied residues at the end.

    Args:
        positions: ``(start, end)`` indices into ``self.filenames``; a falsy
            ``end`` selects everything from ``start`` onwards.
        iteration: Not used by this method.
    """
    # filenames
    if not positions[1]:
        filenames = self.filenames[positions[0]:]
    else:
        filenames = self.filenames[positions[0]:positions[1]]

    # parse files
    for source_file in filenames:
        source_file_path = os.sep.join(
            [self.construct_data_dir, source_file])
        # only regular, non-hidden files are construct definitions
        if not os.path.isfile(source_file_path) or source_file[0] == '.':
            continue

        self.logger.info('Reading file {}'.format(source_file_path))
        # read the yaml file
        # NOTE: yaml.load() without a Loader can build arbitrary Python
        # objects on old PyYAML and raises TypeError on PyYAML >= 6.  Only
        # plain scalars, lists and mappings are read from the document
        # below, so safe_load is used instead.
        with open(source_file_path, 'r') as f:
            sd = yaml.safe_load(f)

        # is a protein specified?
        if 'protein' not in sd:
            self.logger.error(
                'Protein not specified for construct, skipping')
            continue

        # fetch the parent protein
        try:
            ppc = ProteinConformation.objects.prefetch_related(
                'protein__family', 'protein__species',
                'protein__residue_numbering_scheme').get(
                    protein__entry_name=sd['protein'],
                    state__slug=settings.DEFAULT_PROTEIN_STATE)
        except ProteinConformation.DoesNotExist:
            # abort if parent protein is not found
            self.logger.error(
                'Parent protein {} for construct {} not found, aborting!'
                .format(sd['protein'], sd['name']))
            continue

        # sequence type
        try:
            sequence_type, created = ProteinSequenceType.objects.get_or_create(
                slug='mod', defaults={'name': 'Modified'})
            if created:
                self.logger.info(
                    'Created sequence type {}'.format(sequence_type))
        except IntegrityError:
            sequence_type = ProteinSequenceType.objects.get(slug='mod')

        # protein source
        try:
            protein_source, created = ProteinSource.objects.get_or_create(
                name='OTHER')
            if created:
                self.logger.info(
                    'Created protein source {}'.format(protein_source))
        except IntegrityError:
            protein_source = ProteinSource.objects.get(name='OTHER')

        # create a protein record
        parent = ppc.protein
        p = Protein()
        p.parent = parent
        p.family = parent.family
        p.species = parent.species
        p.residue_numbering_scheme = parent.residue_numbering_scheme
        p.sequence_type = sequence_type
        p.source = protein_source
        p.entry_name = slugify(strip_tags(sd['name']))
        p.name = sd['name']
        p.sequence = parent.sequence

        # save protein (construct)
        try:
            p.save()
            self.logger.info(
                'Created construct {} with parent protein {}'.format(
                    p.name, ppc.protein.entry_name))
        except Exception:
            self.logger.error(
                'Failed creating construct {} with parent protein {}'
                .format(p.name, ppc.protein.entry_name))
            continue

        # create protein conformation record
        pc = ProteinConformation()
        pc.protein = p
        pc.state = ProteinState.objects.get(
            slug=settings.DEFAULT_PROTEIN_STATE)
        try:
            pc.save()
            self.logger.info(
                'Created conformation {} of protein {}'.format(
                    pc.state.name, p.name))
        except Exception:
            self.logger.error(
                'Failed creating conformation {} of protein {}'.format(
                    pc.state.name, p.entry_name))
            # residues cannot be attached to an unsaved conformation, so
            # skip this construct like a failed protein save
            continue

        # create residue records
        # deleted positions are kept in a set for fast membership tests
        deletions = set()
        if 'deletions' in sd and sd['deletions']:
            for t in sd['deletions']:
                deletions.update(range(t[0], t[1] + 1))

        # mutations are strings such as <wt residue><position><mutant residue>
        mutations = {}
        if 'mutations' in sd and sd['mutations']:
            for m in sd['mutations']:
                res_num = int(m[1:-1])
                mutations[res_num] = {
                    'wt_res': m[0],
                    'mut_res': m[-1],
                    'full': m,
                }

        # insertions
        split_segments = {}
        if 'insertions' in sd and sd['insertions']:
            for ins in sd['insertions']:
                ins_first, ins_last = ins['positions'][0], ins['positions'][1]
                ins_start = Residue.objects.get(
                    protein_conformation=ppc, sequence_number=ins_first)
                ins_end = Residue.objects.get(
                    protein_conformation=ppc, sequence_number=ins_last)

                # if the insertion is within only one segment (the usual case), split that
                # segment into two segments
                if ins_start and ins_start.protein_segment == ins_end.protein_segment:
                    parent_segment = ins_start.protein_segment
                    # get/create split protein segments
                    halves = []
                    for suffix in ('_1', '_2'):
                        slug = parent_segment.slug + suffix
                        try:
                            segment, created = ProteinSegment.objects.get_or_create(
                                slug=slug,
                                defaults={
                                    'name': parent_segment.name,
                                    'category': parent_segment.category,
                                    'partial': True
                                })
                            if created:
                                self.logger.info(
                                    'Created protein segment {}'.format(segment))
                        except IntegrityError:
                            segment = ProteinSegment.objects.get(slug=slug)
                        halves.append(segment)
                    segment_before, segment_after = halves

                    # keep track of information about split segments
                    split_segments[parent_segment.slug] = {
                        'start': {
                            'sequence_number': ins_first,
                            'segment': segment_before,
                        },
                        'end': {
                            'sequence_number': ins_last,
                            'segment': segment_after,
                        },
                    }
                # if the insertion covers two segments, use those two as the segments before and after
                elif ins_start:
                    segment_before = ins_start.protein_segment
                    segment_after = ins_end.protein_segment

                # if the insertion replaces a part of the sequence, add that range as a deletion
                if ins_last > (ins_first + 1):
                    deletions.update(range(ins_first + 1, ins_last))

                # get/insert fusion protein
                fusion, _ = ProteinFusion.objects.get_or_create(
                    name=ins['name'],
                    defaults={'sequence': ins['sequence']})

                # create relationship with protein
                ProteinFusionProtein.objects.create(
                    protein=p,
                    protein_fusion=fusion,
                    segment_before=segment_before,
                    segment_after=segment_after)

        prs = Residue.objects.filter(
            protein_conformation=ppc).prefetch_related(
                'protein_conformation__protein', 'protein_segment',
                'generic_number', 'display_generic_number__scheme',
                'alternative_generic_numbers__scheme')
        updated_sequence = ''
        for pr in prs:
            if pr.sequence_number in deletions:
                continue

            r = Residue()
            r.protein_conformation = pc
            r.generic_number = pr.generic_number
            r.display_generic_number = pr.display_generic_number
            r.sequence_number = pr.sequence_number

            # check for split segments
            if pr.protein_segment.slug in split_segments:
                split = split_segments[pr.protein_segment.slug]
                if r.sequence_number <= split['start']['sequence_number']:
                    r.protein_segment = split['start']['segment']
                elif r.sequence_number >= split['end']['sequence_number']:
                    r.protein_segment = split['end']['segment']
            else:
                r.protein_segment = pr.protein_segment

            # amino acid, check for mutations
            if r.sequence_number in mutations:
                mutation = mutations[r.sequence_number]
                if mutation['wt_res'] == pr.amino_acid:
                    r.amino_acid = mutation['mut_res']
                else:
                    # format the complete message (previously only the
                    # trailing ' of {}' fragment was formatted)
                    self.logger.error(
                        'Mutation {} in construct {} does not match wild-type sequence of {}'
                        .format(mutation['full'], pc.protein.name,
                                ppc.protein.entry_name))
                    # keep the wild-type residue so the residue and the
                    # rebuilt sequence still carry an amino acid
                    r.amino_acid = pr.amino_acid
            else:
                r.amino_acid = pr.amino_acid

            # save amino acid to updated sequence
            updated_sequence += r.amino_acid

            # save residue before populating M2M relations
            r.save()

            # alternative generic numbers
            for agn in pr.alternative_generic_numbers.all():
                r.alternative_generic_numbers.add(agn)

        # update sequence
        p.sequence = updated_sequence
        p.save()