Пример #1
0
def _get_or_create_generic_number(scheme, label, defaults):
    """Return the ResidueGenericNumber for (scheme, label), creating it if needed.

    If ``get_or_create`` raises ``IntegrityError`` (presumably because the row
    was inserted concurrently), the existing row is looked up instead.
    """
    try:
        generic_number, _ = ResidueGenericNumber.objects.get_or_create(
            scheme=scheme, label=label, defaults=defaults)
    except IntegrityError:
        generic_number = ResidueGenericNumber.objects.get(
            scheme=scheme, label=label)
    return generic_number


def create_or_update_residue(protein_conformation, segment, schemes, residue,
                             b_and_c):
    """Build an unsaved Residue and collect its alternative generic numbers.

    Nothing is written to the Residue table here: the Residue instance is only
    constructed, so the caller is responsible for saving it and for attaching
    the alternative generic numbers. Generic number rows (and the generic
    number equivalent row) are fetched or created as a side effect.

    Args:
        protein_conformation: conformation the residue belongs to; its
            ``protein.residue_numbering_scheme`` is used for the display
            generic number and the equivalent.
        segment: protein segment assigned to the residue and used as the
            default segment of newly created generic numbers.
        schemes: mapping ``slug -> {'generic_numbers': {label: object}}`` used
            as a cache of generic numbers; it is updated in place with every
            generic number that had to be looked up.
        residue: mapping with the keys ``'aa'``, ``'pos'`` and ``'numbers'``.
        b_and_c: passed through unchanged to ``format_generic_numbers``.

    Returns:
        A two-item list ``[residue_instance, alternative_generic_numbers]``.
    """
    # defaults applied only when a generic number row has to be created
    rns_defaults = {'protein_segment': segment}

    # default numbering scheme
    ns = settings.DEFAULT_NUMBERING_SCHEME
    ns_obj = ResidueNumberingScheme.objects.get(slug=ns)

    amino_acid = residue['aa']
    sequence_number = residue['pos']
    numbers = residue['numbers']
    generic_number = None
    display_generic_number = None

    # Expand the raw generic number into the per-scheme numbers used below
    if 'generic_number' in numbers:
        numbers = format_generic_numbers(
            protein_conformation.protein.residue_numbering_scheme, schemes,
            sequence_number, numbers['generic_number'], numbers['bw'], b_and_c)

    # main generic number (default numbering scheme)
    if 'generic_number' in numbers:
        label = numbers['generic_number']
        cache = schemes[ns]['generic_numbers']
        if label not in cache:
            cache[label] = _get_or_create_generic_number(
                ns_obj, label, rns_defaults)
        generic_number = cache[label]

    # equivalent to main generic number in the protein's own scheme
    if 'equivalent' in numbers:
        protein_scheme = protein_conformation.protein.residue_numbering_scheme
        try:
            ResidueGenericNumberEquivalent.objects.get_or_create(
                default_generic_number=generic_number,
                scheme=protein_scheme,
                defaults={'label': numbers['equivalent']})
        except IntegrityError:
            # The result is not needed; the lookup is kept so that a missing
            # row still surfaces as an error, as it did before.
            ResidueGenericNumberEquivalent.objects.get(
                default_generic_number=generic_number,
                scheme=protein_scheme)

    # display generic number (protein's own numbering scheme)
    if 'display_generic_number' in numbers:
        protein_scheme = protein_conformation.protein.residue_numbering_scheme
        label = numbers['display_generic_number']
        cache = schemes[protein_scheme.slug]['generic_numbers']
        if label not in cache:
            cache[label] = _get_or_create_generic_number(
                protein_scheme, label, rns_defaults)
        display_generic_number = cache[label]

    # Unsaved instance, returned to the caller
    bulk_r = Residue(protein_conformation=protein_conformation,
                     sequence_number=sequence_number,
                     amino_acid=amino_acid,
                     display_generic_number=display_generic_number,
                     generic_number=generic_number,
                     protein_segment=segment)

    # alternative generic numbers, one per alternative scheme
    bulk_add_alt = []
    if 'alternative_generic_numbers' in numbers:
        for alt_scheme, alt_num in numbers[
                'alternative_generic_numbers'].items():
            cache = schemes[alt_scheme]['generic_numbers']
            if alt_num not in cache:
                # The scheme is only fetched on a cache miss
                cache[alt_num] = _get_or_create_generic_number(
                    ResidueNumberingScheme.objects.get(slug=alt_scheme),
                    alt_num, rns_defaults)
            bulk_add_alt.append(cache[alt_num])

    return [bulk_r, bulk_add_alt]
    def handle(self, *args, **options):
        """Build Protein, ProteinConformation and Residue records for the
        alpha subunit of every SignprotComplex.

        For each complex the alpha chain is read from the stored PDB data and
        its residues are paired with the residues of ``sc.protein`` by
        sequence number. Pairs whose amino acids differ are reconciled using
        the SEQADV lines of the PDB data and, if mismatches remain, a pairwise
        sequence alignment. One Residue per paired PDB residue is then
        bulk-created under a protein named ``<lower-cased pdb code>_a``.

        When ``options['purge']`` is set, existing ``*_a`` proteins of the
        'Alpha' family, with their conformations and residues, are deleted
        first.

        A failure while processing one complex is printed and logged as an
        error; processing continues with the next complex.
        """
        self.options = options
        if self.options['purge']:
            Residue.objects.filter(
                protein_conformation__protein__entry_name__endswith='_a',
                protein_conformation__protein__family__parent__parent__name=
                'Alpha').delete()
            ProteinConformation.objects.filter(
                protein__entry_name__endswith='_a',
                protein__family__parent__parent__name='Alpha').delete()
            Protein.objects.filter(
                entry_name__endswith='_a',
                family__parent__parent__name='Alpha').delete()

        # Raw strings keep the regex escapes (\s, \d, \S) from being read as
        # invalid string escapes; compiled once because it is loop-invariant.
        # Groups: 1 residue name, 2 chain id, 3 residue number, 4 a 12-char
        # field compared against the accession, 5 residue name, 6 residue
        # number, 7 trailing annotation.
        seqadv_pattern = re.compile(
            r'SEQADV\s{1}[A-Z\s\d]{4}\s{1}([A-Z]{3})\s{1}([A-Z]{1})\s+(\d+)'
            r'[\s\S\d]{5}([\s\S\d]{12})([A-Z]{3})\s+(\d+)(\s\S+)')

        # Building protein and protconf objects for g protein structure in complex
        for sc in SignprotComplex.objects.all():
            self.logger.info(
                'Protein, ProteinConformation and Residue build for alpha subunit of {} is building'
                .format(sc))
            try:
                pdb_code = sc.structure.pdb_code.index
                entry_name = pdb_code.lower() + '_a'

                # Alpha subunit protein: reuse the existing record or create it
                try:
                    alpha_protein = Protein.objects.get(entry_name=entry_name)
                except Protein.DoesNotExist:
                    alpha_protein = Protein()
                    alpha_protein.entry_name = entry_name
                    alpha_protein.accession = None
                    alpha_protein.name = entry_name
                    alpha_protein.sequence = sc.protein.sequence
                    alpha_protein.family = sc.protein.family
                    alpha_protein.parent = sc.protein
                    alpha_protein.residue_numbering_scheme = sc.protein.residue_numbering_scheme
                    alpha_protein.sequence_type = ProteinSequenceType.objects.get(
                        slug='mod')
                    alpha_protein.source = ProteinSource.objects.get(
                        name='OTHER')
                    alpha_protein.species = sc.protein.species
                    alpha_protein.save()

                # Matching conformation: reuse or create in the 'active' state
                try:
                    alpha_protconf = ProteinConformation.objects.get(
                        protein__entry_name=entry_name)
                except ProteinConformation.DoesNotExist:
                    alpha_protconf = ProteinConformation()
                    alpha_protconf.protein = alpha_protein
                    alpha_protconf.state = ProteinState.objects.get(
                        slug='active')
                    alpha_protconf.save()

                pdb_text = sc.structure.pdb_data.pdb
                parser = PDBParser(PERMISSIVE=True, QUIET=True)
                structure = parser.get_structure('struct', StringIO(pdb_text))
                chain = structure[0][sc.alpha]

                # Sequence numbers of the chain residues that have a CA atom
                nums = []
                for res in chain:
                    try:
                        res['CA']
                    except KeyError:
                        continue
                    nums.append(res.get_id()[1])

                resis = Residue.objects.filter(
                    protein_conformation__protein=sc.protein)

                # Create first alignment based on sequence numbers:
                # PDB number -> [PDB residue, residue of sc.protein]
                pdb_num_dict = OrderedDict()
                for n in nums:
                    # 6OIJ: numbers below 30 are paired with number + 6
                    nr = n + 6 if pdb_code == '6OIJ' and n < 30 else n
                    pdb_num_dict[n] = [chain[n], resis.get(sequence_number=nr)]

                # Find mismatches: pairs whose amino acids differ
                mismatches = [
                    pair for pair in pdb_num_dict.values()
                    if AA[pair[0].get_resname()] != pair[1].amino_acid
                ]

                # Search for annotated engineered mutations in pdb SEQADV
                mutations, shifted_mutations = OrderedDict(), OrderedDict()
                for line in pdb_text.split('\n'):
                    if not line.startswith('SEQADV'):
                        continue
                    line_search = seqadv_pattern.search(line)
                    if line_search is None or line_search.group(2) != sc.alpha:
                        continue
                    pdb_num = int(line_search.group(3))
                    residue_names = [
                        line_search.group(1),
                        line_search.group(5)
                    ]
                    if line_search.group(4).strip() != sc.protein.accession:
                        # Covers the former special cases for 6G79 (CONFLICT
                        # with differing numbers) and 5G53 (accession
                        # mismatch): the second test was always true in this
                        # branch, so every such line is a plain mutation.
                        mutations[pdb_num] = residue_names
                    elif line_search.group(3) == line_search.group(6):
                        mutations[pdb_num] = residue_names
                    else:
                        # Same accession but different numbering: remember
                        # the second number as well
                        shifted_mutations[pdb_num] = residue_names + [
                            int(line_search.group(6))
                        ]

                # Check and clear mismatches that are registered in pdb SEQADV as engineered mutation
                remaining_mismatches = []
                for m in mismatches:
                    num = m[0].get_id()[1]
                    if num in mutations:
                        if m[0].get_resname() != mutations[num][0] and m[
                                1].amino_acid != AA[mutations[num][1]]:
                            remaining_mismatches.append(m)
                    else:
                        # Shifted mutations and unannotated mismatches both
                        # stay unresolved at this point
                        remaining_mismatches.append(m)

                # Mismatches remained possibly to seqnumber shift, making pairwise alignment to try and fix alignment
                if remaining_mismatches and pdb_code not in [
                        '6OIJ', '6OY9', '6OYA'
                ]:
                    seq = ''.join(
                        str(pp.get_sequence())
                        for pp in PPBuilder().build_peptides(chain,
                                                             aa_only=False))
                    pw2 = pairwise2.align.localms(sc.protein.sequence, seq, 2,
                                                  -1, -.5, -.1)
                    ref_seq, temp_seq = str(pw2[0][0]), str(pw2[0][1])

                    # PDB residue -> residue of sc.protein according to the
                    # alignment; gapped columns use the column index instead
                    pdb_wt_dict = OrderedDict()
                    j, k = 0, 0  # positions in resis / nums
                    for i, (ref, temp) in enumerate(zip(ref_seq, temp_seq)):
                        if ref != '-' and temp != '-':
                            pdb_wt_dict[pdb_num_dict[nums[k]][0]] = resis[j]
                            j += 1
                            k += 1
                        elif ref == '-':
                            pdb_wt_dict[pdb_num_dict[nums[k]][0]] = i
                            k += 1
                        elif temp == '-':
                            pdb_wt_dict[i] = resis[j]
                            j += 1

                    for i, r in enumerate(remaining_mismatches):
                        num = r[0].get_id()[1]
                        # Adjust for shifted residue when residue is a match
                        # NOTE(review): for i == 0 this compares against the
                        # last mismatch (index -1) - confirm this is intended.
                        if num - remaining_mismatches[
                                i - 1][0].get_id()[1] > 1:
                            pdb_num_dict[num - 1][1] = pdb_wt_dict[chain[
                                num - 1]]
                        # Adjust for shifted residue when residue is mutated and it's logged in SEQADV
                        if num in shifted_mutations:
                            pdb_num_dict[num][1] = resis.get(
                                sequence_number=shifted_mutations[num][2])
                        # Adjust for shift
                        else:
                            pdb_num_dict[num][1] = pdb_wt_dict[r[0]]

                # One new residue per PDB residue: position and amino acid
                # come from the structure, annotations from the paired residue
                bulked_residues = []
                for pdb_res, wt_res in pdb_num_dict.values():
                    res_obj = Residue()
                    res_obj.sequence_number = pdb_res.get_id()[1]
                    res_obj.amino_acid = AA[pdb_res.get_resname()]
                    res_obj.display_generic_number = wt_res.display_generic_number
                    res_obj.generic_number = wt_res.generic_number
                    res_obj.protein_conformation = alpha_protconf
                    res_obj.protein_segment = wt_res.protein_segment
                    bulked_residues.append(res_obj)
                Residue.objects.bulk_create(bulked_residues)
                self.logger.info(
                    'Protein, ProteinConformation and Residue build for alpha subunit of {} is finished'
                    .format(sc))
            except Exception as msg:
                # Boundary for one complex: report and move on to the next
                failure = 'Protein, ProteinConformation and Residue build for alpha subunit of {} has failed'.format(
                    sc)
                print(failure)
                print(msg)
                self.logger.error(failure)
Пример #3
0
    def create_constructs(self, filenames):
        self.logger.info('CREATING CONSTRUCTS')
        
        # what files should be parsed?
        if not filenames:
            filenames = os.listdir(self.construct_data_dir)

        # parse files
        for source_file in filenames:
            source_file_path = os.sep.join([self.construct_data_dir, source_file])
            if os.path.isfile(source_file_path) and source_file[0] != '.':
                self.logger.info('Reading file {}'.format(source_file_path))
                # read the yaml file
                with open(source_file_path, 'r') as f:
                    sd = yaml.load(f)

                    # is a protein specified?
                    if 'protein' not in sd:
                        self.logger.error('Protein not specified for construct, skipping')
                        continue

                    # fetch the parent protein
                    try:
                        ppc = ProteinConformation.objects.select_related('protein__family', 'protein__species',
                            'protein__residue_numbering_scheme').get(protein__entry_name=sd['protein'],
                            state__slug=settings.DEFAULT_PROTEIN_STATE)
                    except ProteinConformation.DoesNotExist:
                        # abort if parent protein is not found
                        self.logger.error('Parent protein {} for construct {} not found, aborting!'.format(
                            sd['protein'], sd['name']))
                        continue

                    # create a protein record
                    p = Protein()
                    p.parent = ppc.protein
                    p.family = ppc.protein.family
                    p.species = ppc.protein.species
                    p.residue_numbering_scheme = ppc.protein.residue_numbering_scheme
                    p.sequence_type, created = ProteinSequenceType.objects.get_or_create(slug='mod',
                        defaults={'name': 'Modified'})
                    p.source, created = ProteinSource.objects.get_or_create(name='OTHER')
                    p.entry_name = slugify(strip_tags(sd['name']))
                    p.name = sd['name']
                    p.sequence = ppc.protein.sequence
                    # save protein (construct)
                    try:
                        p.save()
                        self.logger.info('Created construct {} with parent protein {}'.format(p.name,
                            ppc.protein.entry_name))
                    except Exception as e:
                        print(e)
                        self.logger.error('Failed creating construct {} with parent protein {}'.format(p.name,
                            ppc.protein.entry_name))
                        continue

                    # create protein conformation record
                    pc = ProteinConformation()
                    pc.protein = p
                    pc.state = ProteinState.objects.get(slug=settings.DEFAULT_PROTEIN_STATE)
                    try:
                        pc.save()
                        self.logger.info('Created conformation {} of protein {}'.format(pc.state.name, p.name))
                    except:
                        self.logger.error('Failed creating conformation {} of protein {}'.format(pc.state.name,
                            p.entry_name))

                    # create residue records
                    deletions = []
                    deletions_list = []
                    if 'deletions' in sd and sd['deletions']:
                        for t in sd['deletions']:
                            deletions += list(range(t[0],t[1]+1))
                            deletions_list.append(str(t[0])+'-'+str(t[1])) 
                    s = ","
                    deletion_string = s.join(deletions_list)
                         

                    mutations = {}
                    if 'mutations' in sd and sd['mutations']:
                        for m in sd['mutations']:
                            res_num = m[1:-1]
                            mutations[res_num] = {
                                'wt_res': m[0],
                                'mut_res': m[-1],
                                'full': m,
                            }
                    
                    # Create construct record
                    c = Construct()            
                    c.protein_conformation = pc
                    c.deletions =  deletion_string
                    c.save()
                      

                    # Create Auxiliary proteins
#                    if 'auxiliary_proteins' in sd and sd['auxiliary_proteins']:
#                        ap = AuxProtein()
#                        ap.construct = c
#                        apct = AuxProteinType.objects.create()
                       # ap.protein_type = apct 
#                        apct.save()
#                        if 'remarks' in sd['auxiliary_proteins']:
#                            ap.remarks = sd['auxiliary_proteins']['remarks']
#                        ap.save()
 

#                        for step in sd['auxiliary_proteins']:
#                            if 'type' in step and 'name' in step and'sequence' in step:
#                                ap.protein_type = apct
                 #              ap.protein_type, created = AuxProteinType.objects.get_or_create()
#                                ap.name = sd['auxiliary_proteins']['name']
#                                ap.uniprot_id = sd['auxiliary_proteins']['uniprot_id']
#                                ap.sequence = sd['auxiliary_proteins']['sequence']
                                #mutations if any to be included from mutation model along with reason of mutation
#                                ap.position = sd['auxiliary_proteins']['position']
#                                ap.deletions = sd['auxiliary_proteins']['deletions']
                                
#                            else:
#                                self.logger.error('Auxiliary protein step incorrectly defined for {}'.format(p))



                     # create expression records
                    if 'expression_sys' in sd and sd['expression_sys']:
                        ce = ConstructExpression()           
                        ce.construct = c
                        ce.expression_system, created = ConstructExpressionSystem.objects.get_or_create(expression_method = sd['expression_sys']['expression_method'], host_cell_type = sd['expression_sys']['host_cell_type'], host_cell = sd['expression_sys']['host_cell'])
                        if 'remarks' in sd:
                            ce.remarks = sd['expression_sys']['remarks']
                        ce.save()
 
               
                    # create solubilization records
                    if 'solubilization' in sd and sd['solubilization'] and 'steps' in sd['solubilization'] and sd['solubilization']['steps']:
                        so = ConstructSolubilization()
                        so.construct = c
                        cl = ChemicalList.objects.create()
                        so.chemical_list = cl 

                        for step in sd['solubilization']['steps']:
                            if 'type' in step and 'item' in step and'concentration' in step:
                                chem = Chemical()
                                chem.chemical_type,  created = ChemicalType.objects.get_or_create(name = step['type']) 
                                chem.name =  step['item']
                                chem.save()

                                cc = ChemicalConc()
                                cc.concentration = step['concentration']
                                cc.chemical = chem    # since ChemicalConc has a ForeignKey to Chemical
                                cc.save()
                                cl.chemicals.add(cc)                          
                            else:
                                self.logger.error('Solubilization step incorrectly defined for {}'.format(p))                                 

                        if 'remarks' in sd['solubilization']:
                            so.remarks = sd['solubilization']['remarks']
                        so.save()



                    # create  purification records
                    if 'purification' in sd and sd['purification'] and sd['purification']['steps']:
                        pu = ConstructPurification()
                        pu.construct = c
                        if 'remarks' in sd['purification']:
                            pu.remarks = sd['purification']['remarks']
                        pu.save() 
                        for step in sd['purification']['steps']:
                            if 'type' in step and 'description' in step:
                                pust = PurificationStep()
                                pust.description = step['description']
                                pust.purification = pu
                                pust.purification_type, created = PurificationStepType.objects.get_or_create(name = step['type'] ) # 2 values returned by get_or_create
                                if created: 
                                    self.logger.info('Created purification step type {}'.format(pust.purification_type))
                                pust.save()

                            else:
                                self.logger.error('Purification step incorrectly defined for {}'.format(p))

                        


                   # create crystallization records
                    if 'crystallization' in sd and sd['crystallization']: 
                        cy = ConstructCrystallization()
                        cy.construct = c
                        cyt = CrystallizationMethodTypes.objects.create()
                        cy.crystal_type = cyt
                        cy.method = sd['crystallization']['method']
                        cy.settings = sd['crystallization']['settings']
                        cy.protein_conc = sd['crystallization']['protein_conc']
                        cl = ChemicalList.objects.create()
                        cy.chemical_list = cl   

                        for step in sd['crystallization']['chemicallist']:
                            if 'type' in step and 'item' in step and'concentration' in step:
                                chem = Chemical()
                                chem.chemical_type,  created = ChemicalType.objects.get_or_create(name = step['type']) 
                                
                                chem.name =  step['item']
                                chem.save()
                                cc = ChemicalConc()
                                cc.concentration = step['concentration']
                                cc.chemical = chem    # since ChemicalConc has a ForeignKey to Chemical
                                cc.save()
                                
                                cl.chemicals.add(cc)                          
                            else:
                                self.logger.error('Crystallization step incorrectly defined for {}'.format(p))                        

                        cy.aqueous_solution_lipid_ratio = sd['crystallization']['aqueous_solution_lipid_ratio_LCP']
                        cy.lcp_bolus_volume = sd['crystallization']['LCP_bolus_volume']
                        cy.precipitant_solution_volume = sd['crystallization']['precipitant_solution_volume']
                        cy.temp = sd['crystallization']['temperature']
                        cy.ph = sd['crystallization']['ph']  


                        if 'remarks' in sd['crystallization']:
                            cy.remarks = sd['crystallization']['remarks']
                        cy.save()

                                     
                    # fusion proteins
                    split_segments = {}
                    if 'fusion_proteins' in sd and sd['fusion_proteins']:
                        for fp in sd['fusion_proteins']:
                            fp_start = Residue.objects.get(protein_conformation=ppc,
                                sequence_number=fp['positions'][0])
                            fp_end = Residue.objects.get(protein_conformation=ppc, sequence_number=fp['positions'][1])
                            # if the fusion protein is inserted within only one segment (the usual case), split that
                            # segment into two segments
                            if fp_start and fp_start.protein_segment == fp_end.protein_segment:
                                # get/create split protein segments
                                segment_before, created = ProteinSegment.objects.get_or_create(
                                    slug=fp_start.protein_segment.slug+"_1", defaults={
                                    'name': fp_start.protein_segment.name,
                                    'category': fp_start.protein_segment.category,
                                    'partial': True})
                                segment_after, created = ProteinSegment.objects.get_or_create(
                                    slug=fp_start.protein_segment.slug+"_2", defaults={
                                    'name': fp_start.protein_segment.name,
                                    'category': fp_start.protein_segment.category,
                                    'partial': True})

                                # keep track of  information about split segments
                                split_segments[fp_start.protein_segment.slug] = {
                                    'start': {
                                        'sequence_number': fp['positions'][0],
                                        'segment': segment_before,
                                    },
                                    'end': {
                                        'sequence_number': fp['positions'][1],
                                        'segment': segment_after,
                                    },
                                }

                            # get/insert fusion protein
                            fusion, create = ProteinFusion.objects.get_or_create(name=fp['name'], defaults={
                                'sequence': fp['sequence']})

                            # create relationship with protein
                            ProteinFusionProtein.objects.create(protein=p, protein_fusion=fusion,
                                segment_before=segment_before, segment_after=segment_after)

                    prs = Residue.objects.filter(protein_conformation=ppc).prefetch_related(
                        'protein_conformation__protein', 'protein_segment', 'generic_number',
                        'display_generic_number__scheme', 'alternative_generic_numbers__scheme')
                    updated_sequence = ''
                    for pr in prs:
                        if pr.sequence_number not in deletions:
                            r = Residue()
                            r.protein_conformation = pc
                            r.generic_number = pr.generic_number
                            r.display_generic_number = pr.display_generic_number
                            r.sequence_number = pr.sequence_number
                            
                            # check for split segments
                            if pr.protein_segment.slug in split_segments:
                                rsns = split_segments[pr.protein_segment.slug]['start']['sequence_number']
                                rsne = split_segments[pr.protein_segment.slug]['end']['sequence_number']
                                if r.sequence_number <= rsns:
                                    r.protein_segment = split_segments[pr.protein_segment.slug]['start']['segment']
                                elif r.sequence_number >= rsne:
                                    r.protein_segment = split_segments[pr.protein_segment.slug]['end']['segment']
                            else:
                                r.protein_segment = pr.protein_segment

                            # amino acid, check for mutations
                            if r.sequence_number in mutations:
                                if mutations[r.sequence_number]['wt_res'] == pr.amino_acid:
                                    r.amino_acid = mutations[r.sequence_number]['mut_res']
                                else:
                                    self.logger.error('Mutation {} in construct {} does not match wild-type sequence' \
                                        + ' of {}'.format(mutations[r.sequence_number]['full'], pc.protein.name,
                                        ppc.protein.entry_name))
                            else:
                                r.amino_acid = pr.amino_acid

                            # save amino acid to updated sequence
                            updated_sequence += r.amino_acid

                            # save residue before populating M2M relations
                            r.save()

                            # alternative generic numbers
                            agns = pr.alternative_generic_numbers.all()
                            for agn in agns:
                                r.alternative_generic_numbers.add(agn)
                    
                    # update sequence
                    p.sequence = updated_sequence
                    p.save()

        self.logger.info('COMPLETED CREATING CONSTRUCTS')
    def handle(self, *args, **options):
        """Build alpha-subunit records for complexes, then signprot structures.

        Steps, in order:

        1. If ``options['purge']`` is truthy, delete the ``Residue``,
           ``ProteinConformation`` and ``Protein`` rows whose entry name ends
           with ``_a`` and whose family's grandparent is named 'Alpha', plus
           every ``SignprotStructureExtraProteins`` and ``SignprotStructure``
           row.
        2. Unless ``options['only_signprot_structures']`` is truthy, for each
           ``SignprotComplex``: fetch or create the ``<pdb code>_a`` protein
           and its conformation, pair every residue of the alpha chain that
           has a CA atom with the wild-type residue of the same sequence
           number, try to resolve amino-acid mismatches through SEQADV
           records and a pairwise alignment, and bulk-create the resulting
           residues.
        3. For every protein whose family slug starts with '100_001' and that
           has an accession, build ``SignprotStructure`` objects for the PDB
           ids that are not already part of a complex.

        A failure while processing one complex (step 2) or one PDB id (step 3)
        is logged and the loop continues with the next item.

        Args:
            *args: Unused.
            **options: Command options; the keys ``purge`` and
                ``only_signprot_structures`` are read.
        """
        self.options = options
        if self.options['purge']:
            Residue.objects.filter(
                protein_conformation__protein__entry_name__endswith='_a',
                protein_conformation__protein__family__parent__parent__name=
                'Alpha').delete()
            ProteinConformation.objects.filter(
                protein__entry_name__endswith='_a',
                protein__family__parent__parent__name='Alpha').delete()
            Protein.objects.filter(
                entry_name__endswith='_a',
                family__parent__parent__name='Alpha').delete()
            SignprotStructureExtraProteins.objects.all().delete()
            SignprotStructure.objects.all().delete()

        # SEQADV line pattern, compiled once. Groups as used below:
        #   1 = residue name in the structure, 2 = chain id,
        #   3 = sequence number in the structure, 4 = accession field,
        #   5 = residue name in the reference, 6 = sequence number in the
        #   reference, 7 = trailing annotation (searched for 'CONFLICT' upstream).
        seqadv_pattern = re.compile(
            r'SEQADV\s{1}[A-Z\s\d]{4}\s{1}([A-Z]{3})\s{1}([A-Z]{1})\s+(\d+)[\s\S\d]{5}([\s\S\d]{12})([A-Z]{3})\s+(\d+)(\s\S+)'
        )

        if not options['only_signprot_structures']:
            # Building protein and protconf objects for g protein structure in complex
            for sc in SignprotComplex.objects.all():
                self.logger.info(
                    'Protein, ProteinConformation and Residue build for alpha subunit of {} is building'
                    .format(sc))
                try:
                    pdb_code = sc.structure.pdb_code.index
                    alpha_entry_name = pdb_code.lower() + '_a'

                    # Alpha subunit protein: reuse the existing record or create it
                    try:
                        alpha_protein = Protein.objects.get(
                            entry_name=alpha_entry_name)
                    except Protein.DoesNotExist:
                        alpha_protein = Protein()
                        alpha_protein.entry_name = alpha_entry_name
                        alpha_protein.accession = None
                        alpha_protein.name = alpha_entry_name
                        alpha_protein.sequence = sc.protein.sequence
                        alpha_protein.family = sc.protein.family
                        alpha_protein.parent = sc.protein
                        alpha_protein.residue_numbering_scheme = sc.protein.residue_numbering_scheme
                        alpha_protein.sequence_type = ProteinSequenceType.objects.get(
                            slug='mod')
                        alpha_protein.source = ProteinSource.objects.get(
                            name='OTHER')
                        alpha_protein.species = sc.protein.species
                        alpha_protein.save()

                    # Conformation of the alpha subunit protein
                    try:
                        alpha_protconf = ProteinConformation.objects.get(
                            protein__entry_name=alpha_entry_name)
                    except ProteinConformation.DoesNotExist:
                        alpha_protconf = ProteinConformation()
                        alpha_protconf.protein = alpha_protein
                        alpha_protconf.state = ProteinState.objects.get(
                            slug='active')
                        alpha_protconf.save()

                    pdbp = PDBParser(PERMISSIVE=True, QUIET=True)
                    structure = pdbp.get_structure(
                        'struct', StringIO(sc.structure.pdb_data.pdb))
                    chain = structure[0][sc.alpha]

                    # Sequence numbers of the chain residues that have a CA atom
                    nums = []
                    for res in chain:
                        try:
                            res['CA']
                        except KeyError:
                            continue
                        nums.append(res.get_id()[1])

                    resis = Residue.objects.filter(
                        protein_conformation__protein=sc.protein)
                    pdb_num_dict = OrderedDict()
                    # Create first alignment based on sequence numbers
                    for n in nums:
                        # 6OIJ: numbers below 30 are looked up 6 positions further
                        # in the wild-type protein
                        nr = n + 6 if pdb_code == '6OIJ' and n < 30 else n
                        pdb_num_dict[n] = [
                            chain[n], resis.get(sequence_number=nr)
                        ]

                    # Find mismatches between structure and wild-type amino acids
                    mismatches = [
                        pair for pair in pdb_num_dict.values()
                        if AA[pair[0].get_resname()] != pair[1].amino_acid
                    ]

                    # Search for annotated engineered mutations in pdb SEQADV
                    mutations, shifted_mutations = OrderedDict(), OrderedDict()
                    for pdb_line in sc.structure.pdb_data.pdb.split('\n'):
                        if not pdb_line.startswith('SEQADV'):
                            continue
                        line_search = seqadv_pattern.search(pdb_line)
                        if line_search is None or line_search.group(2) != sc.alpha:
                            continue
                        pdb_seqnum = int(line_search.group(3))
                        residue_names = [
                            line_search.group(1),
                            line_search.group(5)
                        ]
                        if line_search.group(4).strip() == sc.protein.accession:
                            if line_search.group(3) == line_search.group(6):
                                mutations[pdb_seqnum] = residue_names
                            else:
                                shifted_mutations[pdb_seqnum] = residue_names + [
                                    int(line_search.group(6))
                                ]
                        else:
                            # Accession differs from the complex protein (the
                            # 6G79 and 5G53 exceptions): always recorded as an
                            # unshifted mutation.
                            mutations[pdb_seqnum] = residue_names

                    # Check and clear mismatches that are registered in pdb SEQADV as engineered mutation
                    remaining_mismatches = []
                    for m in mismatches:
                        num = m[0].get_id()[1]
                        if num in mutations:
                            if m[0].get_resname() != mutations[num][0] and m[
                                    1].amino_acid != AA[mutations[num][1]]:
                                remaining_mismatches.append(m)
                        else:
                            # shifted mutations and unannotated mismatches are
                            # both handled by the realignment below
                            remaining_mismatches.append(m)

                    # Mismatches remained possibly to seqnumber shift, making pairwise alignment to try and fix alignment
                    if remaining_mismatches and pdb_code not in [
                            '6OIJ', '6OY9', '6OYA', '6LPB', '6WHA'
                    ]:
                        seq = ''.join(
                            str(pp.get_sequence())
                            for pp in PPBuilder().build_peptides(
                                chain, aa_only=False))
                        pw2 = pairwise2.align.localms(sc.protein.sequence, seq,
                                                      2, -1, -.5, -.1)
                        ref_seq, temp_seq = str(pw2[0][0]), str(pw2[0][1])
                        # Maps a structure residue to its aligned wild-type
                        # residue, or to the alignment column index (an int)
                        # when one side of the column is a gap.
                        pdb_wt_dict = OrderedDict()
                        j, k = 0, 0
                        for i, (ref, temp) in enumerate(zip(ref_seq, temp_seq)):
                            if ref != '-' and temp != '-':
                                pdb_wt_dict[pdb_num_dict[nums[k]]
                                            [0]] = resis[j]
                                j += 1
                                k += 1
                            elif ref == '-':
                                pdb_wt_dict[pdb_num_dict[nums[k]][0]] = i
                                k += 1
                            elif temp == '-':
                                pdb_wt_dict[i] = resis[j]
                                j += 1
                        for i, r in enumerate(remaining_mismatches):
                            res_num = r[0].get_id()[1]
                            # Adjust for shifted residue when residue is a match
                            # NOTE(review): for i == 0 this compares against the
                            # last mismatch (index -1) - confirm that is intended.
                            if res_num - remaining_mismatches[
                                    i - 1][0].get_id()[1] > 1:
                                pdb_num_dict[res_num -
                                             1][1] = pdb_wt_dict[chain[res_num
                                                                       - 1]]
                            # Adjust for shifted residue when residue is mutated and it's logged in SEQADV
                            if res_num in shifted_mutations:
                                pdb_num_dict[res_num][1] = resis.get(
                                    sequence_number=shifted_mutations[res_num]
                                    [2])
                            # Adjust for shift
                            else:
                                pdb_num_dict[res_num][1] = pdb_wt_dict[r[0]]
                    # Custom alignment fix for 6WHA mini-Gq/Gi2/Gs chimera
                    # elif sc.structure.pdb_code.index=='6WHA':
                    #     ref_seq  = 'MTLESIMACCLSEEAKEARRINDEIERQLRRDKRDARRELKLLLLGTGESGKSTFIKQMRIIHGSGYSDEDKRGFTKLVYQNIFTAMQAMIRAMDTLKIPYKYEHNKAHAQLVREVDVEKVSAFENPYVDAIKSLWNDPGIQECYDRRREYQLSDSTKYYLNDLDRVADPAYLPTQQDVLRVRVPTTGIIEYPFDLQSVIFRMVDVGGQRSERRKWIHCFENVTSIMFLVALSEYDQVLVESDNENRMEESKALFRTIITYPWFQNSSVILFLNKKDLLEEKIMY--SHLVDYFPEYDGP----QRDAQAAREFILKMFVDL---NPDSDKIIYSHFTCATDTENIRFVFAAVKDTILQLNLKEYNLV'
                    #     temp_seq = '----------VSAEDKAAAERSKMIDKNLREDGEKARRTLRLLLLGADNSGKSTIVK----------------------------------------------------------------------------------------------------------------------------------GIFETKFQVDKVNFHMFDVG-----RRKWIQCFNDVTAIIFVVDSSDYNR----------LQEALNDFKSIWNNRWLRTISVILFLNKQDLLAEKVLAGKSKIEDYFPEFARYTTPDPRVTRAKY-FIRKEFVDISTASGDGRHICYPHFTC-VDTENARRIFNDCKDIILQMNLREYNLV'
                    #     for i, ref, temp in zip(range(0,len(ref_seq)), ref_seq, temp_seq):
                    #         print(i, ref, temp)
                    #     pprint.pprint(pdb_num_dict)

                    # Copy the wild-type annotation onto new residues of the
                    # alpha conformation; int placeholders mean "no annotation".
                    bulked_residues = []
                    for pdb_res, wt_res in pdb_num_dict.values():
                        if not isinstance(wt_res, int):
                            res_obj = Residue()
                            res_obj.sequence_number = pdb_res.get_id()[1]
                            res_obj.amino_acid = AA[pdb_res.get_resname()]
                            res_obj.display_generic_number = wt_res.display_generic_number
                            res_obj.generic_number = wt_res.generic_number
                            res_obj.protein_conformation = alpha_protconf
                            res_obj.protein_segment = wt_res.protein_segment
                            bulked_residues.append(res_obj)
                        else:
                            self.logger.info(
                                'Skipped {} as no annotation was present, while building for alpha subunit of {}'
                                .format(wt_res, sc))

                    Residue.objects.bulk_create(bulked_residues)
                    self.logger.info(
                        'Protein, ProteinConformation and Residue build for alpha subunit of {} is finished'
                        .format(sc))
                except Exception as msg:
                    # Report the cause so a failed build can be diagnosed from
                    # the log, in the same format as the handler further down.
                    self.logger.error(
                        'Protein, ProteinConformation and Residue build for alpha subunit of {} has failed\n{}: {}'
                        .format(sc, type(msg), msg))

        ### Build SignprotStructure objects from non-complex signprots
        g_prot_alphas = Protein.objects.filter(
            family__slug__startswith='100_001',
            accession__isnull=False)
        # Materialised once so each membership test below is a set lookup
        complex_structures = set(SignprotComplex.objects.all().values_list(
            'structure__pdb_code__index', flat=True))
        for a in g_prot_alphas:
            pdb_list = get_pdb_ids(a.accession)
            for pdb in pdb_list:
                if pdb not in complex_structures:
                    try:
                        data = self.fetch_gprot_data(pdb, a)
                        if data:
                            self.build_g_prot_struct(a, pdb, data)
                    except Exception as msg:
                        self.logger.error(
                            'SignprotStructure of {} {} failed\n{}: {}'.format(
                                a.entry_name, pdb, type(msg), msg))
Пример #5
0
    def create_residues(self, args):
        """Create residues and their generic numbers from residue dump files.

        For every numbering scheme listed below, the scheme object is fetched
        and, when a ``mapping_<scheme>.txt`` file exists in
        ``self.generic_numbers_source_dir``, its two-column conversion table is
        loaded. Each file name in ``args`` is then read from
        ``self.dump_source_dir`` line by line; files that do not exist are
        reported and skipped.

        Args:
            args: Iterable of dump file names relative to
                ``self.dump_source_dir``.
        """
        schemes = {
            'gpcrdb': {'type': False},
            'gpcrdba': {
                'type': 'structure',
                'seq_based': 'bw',
            },
            'gpcrdbb': {
                'type': 'structure',
                'seq_based': 'woot',
            },
            'gpcrdbc': {
                'type': 'structure',
                'seq_based': 'pin',
            },
            'gpcrdbf': {
                'type': 'structure',
                'seq_based': 'wang',
            },
            'bw': {'type': 'sequence'},
            'woot': {'type': 'sequence'},
            'pin': {'type': 'sequence'},
            'wang': {'type': 'sequence'},
        }

        # fetch schemes and conversion tables
        for scheme_name, scheme in schemes.items():
            scheme['obj'] = ResidueNumberingScheme.objects.get(slug=scheme_name)
            mapping_file = os.sep.join([self.generic_numbers_source_dir, 'mapping_' + scheme_name + '.txt'])
            if os.path.isfile(mapping_file):
                with open(mapping_file, "r", encoding='UTF-8') as scheme_table_file:
                    scheme['table'] = {}
                    for row in scheme_table_file:
                        split_row = shlex.split(row)
                        scheme['table'][split_row[0]] = split_row[1]

        # entry names already known to be absent from the db, so their
        # remaining lines are skipped without another query
        missing_proteins = set()
        self.logger.info('CREATING RESIDUES')
        for arg in args:
            dump_path = os.sep.join([self.dump_source_dir, arg])
            if not os.path.exists(dump_path):
                print("Failed to open file {!s}".format(dump_path))
                self.logger.error("Failed to open file {!s}".format(dump_path))
                continue
            # context manager so the dump file is closed once it has been read
            with open(dump_path, 'r') as residue_data_fh:
                self.logger.info('Parsing residue data from {}'.format(arg))
                for line in residue_data_fh:
                    self._create_residue_from_dump_line(line, schemes, missing_proteins)
        self.logger.info('COMPLETED CREATING RESIDUES')

    def _fetch_or_insert_generic_number(self, label, scheme_obj, residue):
        """Return the generic number ``label`` of ``scheme_obj``, creating it if absent.

        ``residue.protein_segment`` is read only when a new record has to be
        created, and becomes the segment of that new record.
        """
        try:
            return ResidueGenericNumber.objects.get(label=label, scheme=scheme_obj)
        except ResidueGenericNumber.DoesNotExist:
            generic_number = ResidueGenericNumber()
            generic_number.label = label
            generic_number.scheme = scheme_obj
            generic_number.protein_segment = residue.protein_segment
            generic_number.save()
            return generic_number

    def _create_residue_from_dump_line(self, line, schemes, missing_proteins):
        """Create one residue, with its generic numbers, from one dump line.

        A line holds ten comma-separated, optionally double-quoted fields:
        id, residue number, residue name, oliveira number, gpcrdb number,
        bw number, bw2, bs, protein entry name and segment slug. The id, bw2
        and bs fields are not used.

        Args:
            line: Raw line from the dump file.
            schemes: Scheme dict built by ``create_residues`` (scheme objects
                and conversion tables).
            missing_proteins: Set of entry names without a conformation in the
                db; extended in place.

        Raises:
            ValueError: If a non-blank line does not hold exactly ten fields.
        """
        if not line.strip():
            # blank lines (e.g. at the end of the file) carry no residue
            return

        # double strip due to some weird bug...
        (_id, res_num, res_name, oli, gpcrdb, bw, _bw2, _bs, prot_name,
            sec_str_name) = [x.strip().strip('"') for x in line.split(',')]
        if prot_name in missing_proteins:
            return

        # Checking if the protein exists in the db
        try:
            pconf = ProteinConformation.objects.get(protein__entry_name=prot_name,
                state__slug=settings.DEFAULT_PROTEIN_STATE)
        except ProteinConformation.DoesNotExist:
            missing_proteins.add(prot_name)
            return

        # Checking if given residue already exists in the db
        if Residue.objects.filter(protein_conformation=pconf.id, sequence_number=res_num).exists():
            return

        r = Residue()
        r.protein_conformation = pconf
        r.sequence_number = int(res_num)
        r.amino_acid = polypeptide.three_to_one(res_name.upper())

        # the residue has to be saved before M2M relations can be added to it
        try:
            r.save()
            self.logger.info('Created residue {:n}{!s} for protein {!s}'.format(r.sequence_number,
                r.amino_acid, pconf.protein.entry_name))
        except Exception as msg:
            print(msg)
            self.logger.error('Failed to create residue {:n}{!s} for protein {!s}'.format(
                r.sequence_number, r.amino_acid, pconf.protein.entry_name))
            return

        # residue segment (best effort: a failed lookup is logged, not fatal)
        try:
            r.protein_segment = ProteinSegment.objects.get(slug=sec_str_name)
        except Exception:
            self.logger.error('Failed to fetch protein segment {}'.format(sec_str_name))

        # generic number
        if str(oli) != '0' and gpcrdb != 'None' and bw != 'None':
            # separate bulge number (1241 - > 124 + 1)
            bulge_prime = ''
            dump_oliveira = str(oli)
            if len(dump_oliveira) == 4:
                bulge_prime = dump_oliveira[3]
                dump_oliveira = dump_oliveira[:3]
            dump_gpcrdb = gpcrdb[:4]
            dump_seq_based = bw

            # default gpcrdb number
            default_scheme = schemes[settings.DEFAULT_NUMBERING_SCHEME]
            def_gpcrdb = False
            if dump_oliveira in default_scheme['table']:
                default_label = default_scheme['table'][dump_oliveira] + bulge_prime
                try:
                    def_gpcrdb = ResidueGenericNumber.objects.get(label=default_label,
                        scheme=default_scheme['obj'])
                except ResidueGenericNumber.DoesNotExist:
                    def_gpcrdb = ResidueGenericNumber()
                    def_gpcrdb.label = default_label
                    def_gpcrdb.scheme = default_scheme['obj']
                    def_gpcrdb.protein_segment = r.protein_segment
                    def_gpcrdb.save()
                    self.logger.info('Created generic number {:s} in numbering scheme {:s}'
                        .format(default_label, default_scheme['obj'].short_name))

            # if default number was found/added successfully, process the alternative numbers
            if def_gpcrdb:
                # add default generic number to residue record
                r.generic_number = def_gpcrdb

                protein_scheme_slug = pconf.protein.residue_numbering_scheme.slug
                protein_seq_scheme = schemes[protein_scheme_slug]['seq_based']

                # dict of sequence-based numbers, for use in structure-based numbers (5.46x461)
                seq_based_labels = {}

                # sequence-based schemes first (the sequence-based numbers are needed for the
                # structure based schemes)
                for scheme_name, scheme in schemes.items():
                    if scheme['type'] != 'sequence':
                        continue
                    if scheme_name == protein_seq_scheme:
                        # this number is in the scheme defined for this protein
                        seq_based_label = dump_seq_based
                    else:
                        # convert the number to the correct scheme; reset first so a
                        # failed lookup cannot reuse the label of a previous scheme
                        seq_based_label = None
                        for d, c in schemes[protein_seq_scheme]['table'].items():
                            if c == dump_seq_based:
                                seq_based_label = scheme['table'][d]
                                break
                    if seq_based_label is None:
                        self.logger.error(
                            'Failed to convert generic number {} to numbering scheme {} for residue {}{!s}'
                            ' of protein {!s}'.format(dump_seq_based, scheme_name, res_num, res_name,
                            pconf.protein.entry_name))
                        continue

                    # fetch/insert the number
                    seq_based = self._fetch_or_insert_generic_number(seq_based_label, scheme['obj'], r)
                    r.alternative_generic_numbers.add(seq_based)

                    # add added number to the dict for later use
                    seq_based_labels[scheme_name] = seq_based_label

                # structure-based numbers
                for scheme_name, scheme in schemes.items():
                    if scheme['type'] != 'structure':
                        continue
                    if scheme_name == protein_scheme_slug:
                        # this number is in the scheme defined for this protein
                        struct_based_label = dump_gpcrdb + bulge_prime
                    else:
                        # convert the number to the correct scheme (reset first, as above)
                        struct_based_label = None
                        for d, c in schemes[protein_scheme_slug]['table'].items():
                            if c == dump_gpcrdb:
                                struct_based_label = scheme['table'][d] + bulge_prime
                                break
                    if struct_based_label is None or scheme['seq_based'] not in seq_based_labels:
                        self.logger.error(
                            'Failed to convert generic number {} to numbering scheme {} for residue {}{!s}'
                            ' of protein {!s}'.format(dump_gpcrdb, scheme_name, res_num, res_name,
                            pconf.protein.entry_name))
                        continue

                    # add the sequence-based label (5x461 -> 5.46x461)
                    split_struct_based_label = struct_based_label.split('x')
                    struct_based_label = (seq_based_labels[scheme['seq_based']] + 'x' +
                        split_struct_based_label[1])

                    # fetch/insert the number
                    struct_based = self._fetch_or_insert_generic_number(struct_based_label,
                        scheme['obj'], r)

                    # add to residue as a display number or alternative number?
                    if scheme_name == protein_scheme_slug:
                        r.display_generic_number = struct_based
                    else:
                        r.alternative_generic_numbers.add(struct_based)

        # second save stores the segment and generic number references set above
        try:
            r.save()
            self.logger.info('Added generic numbers for residue {}{!s} for protein {!s}'.format(res_num,
                res_name, pconf.protein.entry_name))
        except Exception as msg:
            print(msg)
            self.logger.error(
                'Failed to create generic numbers for residue {}{!s} for protein {!s}'.format(res_num,
                    res_name, pconf.protein.entry_name))
Пример #6
0
    def main_func(self, positions, iteration):
        # filenames
        if not positions[1]:
            filenames = self.filenames[positions[0]:]
        else:
            filenames = self.filenames[positions[0]:positions[1]]

        # parse files
        for source_file in filenames:
            source_file_path = os.sep.join([self.construct_data_dir, source_file])
            if os.path.isfile(source_file_path) and source_file[0] != '.':
                self.logger.info('Reading file {}'.format(source_file_path))
                # read the yaml file
                with open(source_file_path, 'r') as f:
                    sd = yaml.load(f)

                    # is a protein specified?
                    if 'protein' not in sd:
                        self.logger.error('Protein not specified for construct, skipping')
                        continue

                    # fetch the parent protein
                    try:
                        ppc = ProteinConformation.objects.prefetch_related('protein__family', 'protein__species',
                            'protein__residue_numbering_scheme').get(protein__entry_name=sd['protein'],
                            state__slug=settings.DEFAULT_PROTEIN_STATE)
                    except ProteinConformation.DoesNotExist:
                        # abort if parent protein is not found
                        self.logger.error('Parent protein {} for construct {} not found, aborting!'.format(
                            sd['protein'], sd['name']))
                        continue

                    # sequence type
                    try:
                        sequence_type, created = ProteinSequenceType.objects.get_or_create(slug='mod',
                            defaults={'name': 'Modified'})
                        if created:
                            self.logger.info('Created sequence type {}'.format(sequence_type))
                    except IntegrityError:
                        sequence_type = ProteinSequenceType.objects.get(slug='mod')

                    # protein source
                    try:
                        protein_source, created = ProteinSource.objects.get_or_create(name='OTHER')
                        if created:
                            self.logger.info('Created protein source {}'.format(protein_source))
                    except IntegrityError:
                        protein_source = ProteinSource.objects.get(name='OTHER')

                    # create a protein record
                    p = Protein()
                    p.parent = ppc.protein
                    p.family = ppc.protein.family
                    p.species = ppc.protein.species
                    p.residue_numbering_scheme = ppc.protein.residue_numbering_scheme
                    p.sequence_type= sequence_type
                    p.source = protein_source
                    p.entry_name = slugify(strip_tags(sd['name']))
                    p.name = sd['name']
                    p.sequence = ppc.protein.sequence

                    # save protein (construct)
                    try:
                        p.save()
                        self.logger.info('Created construct {} with parent protein {}'.format(p.name,
                            ppc.protein.entry_name))
                    except:
                        self.logger.error('Failed creating construct {} with parent protein {}'.format(p.name,
                            ppc.protein.entry_name))
                        continue

                    # create protein conformation record
                    pc = ProteinConformation()
                    pc.protein = p
                    pc.state = ProteinState.objects.get(slug=settings.DEFAULT_PROTEIN_STATE)
                    try:
                        pc.save()
                        self.logger.info('Created conformation {} of protein {}'.format(pc.state.name, p.name))
                    except:
                        self.logger.error('Failed creating conformation {} of protein {}'.format(pc.state.name,
                            p.entry_name))

                    # process deletions (save in db, and for sequence processing)
                    deletions = []
                    if 'deletions' in sd and sd['deletions']:
                        for t in sd['deletions']:
                            deletions += list(range(t[0],t[1]+1))
                            deletion = ConstructDeletion.objects.create(construct=pc, start=t[0], end=t[1])
                            if created:
                                self.logger.info('Created deletion {}-{} for {}'.format(t[0], t[1],
                                    pc.protein.entry_name))

                    # process mutations (save in db, and for sequence processing)
                    mutations = {}
                    if 'mutations' in sd and sd['mutations']:
                        for m in sd['mutations']:
                            res_num = int(m[1:-1])
                            mutations[res_num] = {
                                'wt_res': m[0],
                                'mut_res': m[-1],
                                'full': m,
                            }
                            mutation = ConstructMutation.objects.get_or_create(
                                construct=pc,
                                sequence_number=res_num,
                                wild_type_amino_acid=m[0],
                                mutated_amino_acid=m[-1],
                            )

                    # insertions
                    split_segments = {}
                    if 'insertions' in sd and sd['insertions']:
                        for ins in sd['insertions']:
                            ins_start = Residue.objects.get(protein_conformation=ppc,
                                sequence_number=ins['positions'][0])
                            ins_end = Residue.objects.get(protein_conformation=ppc,
                                sequence_number=ins['positions'][1])
                            # if the insertion is within only one segment (the usual case), split that
                            # segment into two segments
                            if ins_start and ins_start.protein_segment == ins_end.protein_segment:
                                # get/create split protein segments
                                slug_1 = ins_start.protein_segment.slug + "_1"
                                try:
                                    segment_before, created = ProteinSegment.objects.get_or_create(slug=slug_1,
                                        defaults={'name': ins_start.protein_segment.name,
                                        'category': ins_start.protein_segment.category, 'partial': True})
                                    if created:
                                        self.logger.info('Created protein segment {}'.format(segment_before))
                                except IntegrityError:
                                    segment_before = ProteinSegment.objects.get(slug=slug_1)

                                slug_2 = ins_start.protein_segment.slug + "_2"
                                try:
                                    segment_after, created = ProteinSegment.objects.get_or_create(slug=slug_2,
                                        defaults={'name': ins_start.protein_segment.name,
                                        'category': ins_start.protein_segment.category, 'partial': True})
                                    if created:
                                        self.logger.info('Created protein segment {}'.format(segment_after))
                                except IntegrityError:
                                    segment_after = ProteinSegment.objects.get(slug=slug_2)

                                # keep track of  information about split segments
                                split_segments[ins_start.protein_segment.slug] = {
                                    'start': {
                                        'sequence_number': ins['positions'][0],
                                        'segment': segment_before,
                                    },
                                    'end': {
                                        'sequence_number': ins['positions'][1],
                                        'segment': segment_after,
                                    },
                                }
                            # if the insertion covers two segments, use those two as the segments before and after
                            elif ins_start:
                                segment_before = ins_start.protein_segment
                                segment_after = ins_end.protein_segment

                            # if the insertion replaces a part of the sequence, add that range as a deletion
                            if ins['positions'][1] > (ins['positions'][0] + 1):
                                deletions += list(range((ins['positions'][0] + 1), ins['positions'][1]))

                            # get/insert fusion protein
                            fusion, create = ProteinFusion.objects.get_or_create(name=ins['name'], defaults={
                                'sequence': ins['sequence']})

                            # create relationship with protein
                            ProteinFusionProtein.objects.create(protein=p, protein_fusion=fusion,
                                segment_before=segment_before, segment_after=segment_after)

                    # create expression records
                    if 'expression_sys' in sd and sd['expression_sys']:
                        ce = Expression()           
                        ce.construct = pc
                        ce.expression_system, created = ExpressionSystem.objects.get_or_create(
                            expression_method = sd['expression_sys']['expression_method'],
                            host_cell_type = sd['expression_sys']['host_cell_type'],
                            host_cell = sd['expression_sys']['host_cell'])
                        if 'remarks' in sd:
                           ce.remarks = sd['expression_sys']['remarks']
                        ce.save()
                    
                    # create solubilization records
                    if ('solubilization' in sd and sd['solubilization'] and 'steps' in sd['solubilization']
                        and sd['solubilization']['steps']):
                        so = Solubilization()
                        so.construct = pc
                        cl = ChemicalList.objects.create()
                        so.chemical_list = cl 

                        for step in sd['solubilization']['steps']:
                            if 'type' in step and 'item' in step and'concentration' in step:
                                chem = Chemical()
                                chem.chemical_type,  created = ChemicalType.objects.get_or_create(name = step['type'])
                                chem.name =  step['item']
                                chem.save()

                                cc = ChemicalConc()
                                cc.concentration = step['concentration']
                                cc.chemical = chem    # since ChemicalConc has a ForeignKey to Chemical
                                cc.save()
                                cl.chemicals.add(cc)                          
                            else:
                                self.logger.error('Solubilization step incorrectly defined for {}'.format(p))

                        if 'remarks' in sd['solubilization']:
                            so.remarks = sd['solubilization']['remarks']
                        so.save()
                    
                    # create  purification records
                    if 'purification' in sd and sd['purification'] and sd['purification']['steps']:
                        pu = Purification()
                        pu.construct = pc
                        if 'remarks' in sd['purification']:
                            pu.remarks = sd['purification']['remarks']
                        pu.save() 
                        for step in sd['purification']['steps']:
                            if 'type' in step and 'description' in step:
                                pust = PurificationStep()
                                pust.description = step['description']
                                pust.purification = pu
                                pust.purification_type, created = PurificationStepType.objects.get_or_create(
                                    name = step['type'] ) # 2 values returned by get_or_create
                                if created: 
                                    self.logger.info('Created purification step type {}'.format(
                                        pust.purification_type))
                                pust.save()

                            else:
                                self.logger.error('Purification step incorrectly defined for {}'.format(p))
                    
                    # create crystallization records
                    if 'crystallization' in sd and sd['crystallization']: 
                        cy = Crystallization()
                        cy.construct = pc
                        cyt = CrystallizationMethodTypes.objects.create()
                        cy.crystal_type = cyt
                        cy.method = sd['crystallization']['method']
                        cy.settings = sd['crystallization']['settings']
                        cy.protein_conc = sd['crystallization']['protein_conc']
                        cl = ChemicalList.objects.create()
                        cy.chemical_list = cl

                        for step in sd['crystallization']['chemicallist']:
                            if 'type' in step and 'item' in step and'concentration' in step:
                                chem = Chemical()
                                chem.chemical_type,  created = ChemicalType.objects.get_or_create(name = step['type']) 

                                chem.name =  step['item']
                                chem.save()
                                cc = ChemicalConc()
                                cc.concentration = step['concentration']
                                cc.chemical = chem    # since ChemicalConc has a ForeignKey to Chemical
                                cc.save()

                                cl.chemicals.add(cc)                          
                            else:
                                self.logger.error('Crystallization step incorrectly defined for {}'.format(p))                        

                        cy.aqueous_solution_lipid_ratio = sd['crystallization']['aqueous_solution_lipid_ratio_LCP']
                        cy.lcp_bolus_volume = sd['crystallization']['LCP_bolus_volume']
                        cy.precipitant_solution_volume = sd['crystallization']['precipitant_solution_volume']
                        cy.temp = sd['crystallization']['temperature']
                        cy.ph = sd['crystallization']['ph']  


                        if 'remarks' in sd['crystallization']:
                            cy.remarks = sd['crystallization']['remarks']
                        cy.save()
                    
                    # create residues
                    prs = Residue.objects.filter(protein_conformation=ppc).prefetch_related(
                        'protein_conformation__protein', 'protein_segment', 'generic_number',
                        'display_generic_number__scheme', 'alternative_generic_numbers__scheme')
                    updated_sequence = ''
                    for pr in prs:
                        if pr.sequence_number not in deletions:
                            r = Residue()
                            r.protein_conformation = pc
                            r.generic_number = pr.generic_number
                            r.display_generic_number = pr.display_generic_number
                            r.sequence_number = pr.sequence_number

                            # check for split segments
                            if pr.protein_segment.slug in split_segments:
                                rsns = split_segments[pr.protein_segment.slug]['start']['sequence_number']
                                rsne = split_segments[pr.protein_segment.slug]['end']['sequence_number']
                                if r.sequence_number <= rsns:
                                    r.protein_segment = split_segments[pr.protein_segment.slug]['start']['segment']
                                elif r.sequence_number >= rsne:
                                    r.protein_segment = split_segments[pr.protein_segment.slug]['end']['segment']
                            else:
                                r.protein_segment = pr.protein_segment

                            # amino acid, check for mutations
                            if r.sequence_number in mutations:
                                if mutations[r.sequence_number]['wt_res'] == pr.amino_acid:
                                    r.amino_acid = mutations[r.sequence_number]['mut_res']
                                else:
                                    self.logger.error('Mutation {} in construct {} does not match wild-type sequence' \
                                        + ' of {}'.format(mutations[r.sequence_number]['full'], pc.protein.name,
                                        ppc.protein.entry_name))
                            else:
                                r.amino_acid = pr.amino_acid

                            # save amino acid to updated sequence
                            updated_sequence += r.amino_acid

                            # save residue before populating M2M relations
                            r.save()

                            # alternative generic numbers
                            agns = pr.alternative_generic_numbers.all()
                            for agn in agns:
                                r.alternative_generic_numbers.add(agn)

                    # update sequence
                    p.sequence = updated_sequence
                    p.save()
Пример #7
0
    def create_rotamers(self, structure, pdb_path):
        """Store one Residue and one Rotamer per residue of the preferred chain.

        The chain sequence read from ``pdb_path`` is aligned against the parent
        (wild-type, WT) protein sequence so that every structure residue can
        inherit the generic numbers and protein segment of its WT counterpart.
        The ATOM records of ``structure.pdb_data.pdb`` are then grouped per
        residue; each group is stored as PdbData and linked through a Rotamer.

        Args:
            structure: object exposing ``preferred_chain``,
                ``protein_conformation`` and ``pdb_data.pdb`` (PDB text).
            pdb_path: PDB file handed to ``PDBParser.get_structure``.

        Returns:
            None. All results are written to the database.
        """
        # chain id -> {PDB residue number: [1-based index in seq, one-letter AA]}
        pdbseq = {}
        # alignment column -> [WT position or None, WT amino acid or '-']
        ref_positions = {}
        # 1-based index in the structure sequence -> [AA, ref_positions entry]
        mapped_seq = {}
        # WT positions that are aligned against a gap in the structure sequence
        unmapped_ref = {}

        # values such as "A,B" list several chains; only the first one is used
        preferred_chain = structure.preferred_chain.split(',')[0]

        AA = {
            'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D',
            'CYS': 'C', 'GLN': 'Q', 'GLU': 'E', 'GLY': 'G',
            'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K',
            'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S',
            'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V',
        }

        model = PDBParser(PERMISSIVE=True, QUIET=True).get_structure('ref', pdb_path)[0]
        chain = model[preferred_chain]  # select only one chain (avoid n-mer receptors)
        ppb = PPBuilder()

        # Remove residues numbered >1000 (fusion protein / G protein). The count
        # of residues numbered <600 guards structures whose receptor numbering
        # itself lies in the 1000s (4LDE, 4LDL, 4LDO, 4N4W, 4QKX).
        low_numbered = 0
        for pp in ppb.build_peptides(chain):
            for res in pp:
                res_id = res.id
                if res_id[1] < 600:
                    low_numbered += 1
                if res_id[1] > 1000 and low_numbered > 200:
                    chain.detach_child(res_id)

        # Sequence of the remaining chain plus a lookup from PDB residue number
        # to the residue's 1-based index in that sequence.
        seq_parts = []
        seq_index = 1
        for pp in ppb.build_peptides(chain):
            seq_parts.append(str(pp.get_sequence()))
            for res in pp:
                full_id = res.get_full_id()
                pdbseq.setdefault(full_id[2], {})[full_id[3][1]] = [seq_index, AA[res.resname]]
                seq_index += 1
        seq = ''.join(seq_parts)

        parent = structure.protein_conformation.protein.parent
        parent_seq = str(parent.sequence)

        rs = Residue.objects.filter(protein_conformation__protein=parent).prefetch_related(
            'display_generic_number', 'generic_number', 'protein_segment')
        # match a WT sequence number to its record (for duplication of GN values)
        wt_lookup = {r.sequence_number: r for r in rs}

        # align WT with structure seq -- big gap penalties to avoid overfitting
        pw2 = pairwise2.align.localms(parent_seq, seq, 2, -4, -4, -.1)
        aligned_ref = pw2[0][0]
        aligned_seq = pw2[0][1]

        gaps = 0
        for col, ref_aa in enumerate(aligned_ref, 1):
            if ref_aa == '-':
                gaps += 1
                ref_positions[col] = [None, '-']
            else:
                ref_positions[col] = [col - gaps, ref_aa]
            if aligned_seq[col - 1] == '-':
                unmapped_ref[col - gaps] = '-'

        gaps = 0
        for col, seq_aa in enumerate(aligned_seq, 1):
            if seq_aa == '-':
                gaps += 1
            else:
                mapped_seq[col - gaps] = [seq_aa, ref_positions[col]]

        protein_conformation = structure.protein_conformation

        # Only ATOM records are considered; everything else is dropped up front.
        atom_lines = [l for l in structure.pdb_data.pdb.splitlines() if l.startswith('ATOM')]
        last_index = len(atom_lines) - 1

        rotamer_lines = []      # ATOM lines collected for the residue in progress
        current_number = None   # residue number (text) of the residue in progress

        for i, line in enumerate(atom_lines):
            chain_id = line[21]
            if preferred_chain and chain_id != preferred_chain:
                continue

            nextline = atom_lines[i + 1] if i < last_index else ''
            next_number = nextline[22:26].strip()
            rotamer_lines.append(line)

            # Keep collecting while the next ATOM line continues this residue
            # (or while no residue number has been recorded yet).
            if nextline and (current_number is None or next_number == current_number):
                current_number = next_number
                continue

            # The residue is complete. Fall back to this line's own number when
            # none was recorded (a lone ATOM line), which previously crashed.
            number_text = line[22:26].strip() if current_number is None else current_number
            pdb_block = ''.join(atom + "\n" for atom in rotamer_lines)
            rotamer_lines = []
            current_number = next_number

            sequence_number = int(number_text)
            if sequence_number >= 2000:
                continue

            residue = Residue()
            residue.sequence_number = sequence_number
            # Name is read from the line being flushed; the previous line may
            # belong to a different residue when a residue has a single atom.
            residue.amino_acid = AA[line[17:20].upper()]
            residue.protein_conformation = protein_conformation

            try:
                seq_num_pos = pdbseq[chain_id][sequence_number][0]
            except KeyError:
                # residue is not part of the parsed peptide sequence: skip it
                continue

            # Find the WT residue this structure residue corresponds to.
            wt_r = None
            if seq_num_pos in mapped_seq:
                wt_position = mapped_seq[seq_num_pos][1][0]
                if wt_position is not None:
                    wt_r = wt_lookup[wt_position]
                    # If the aligned WT residue disagrees on both number and
                    # amino acid, prefer the WT residue with the same number
                    # when it has the same amino acid and was left unaligned.
                    if (sequence_number != wt_r.sequence_number
                            and residue.amino_acid != wt_r.amino_acid
                            and sequence_number in wt_lookup
                            and wt_lookup[sequence_number].amino_acid == residue.amino_acid
                            and sequence_number in unmapped_ref):
                        wt_r = wt_lookup[sequence_number]

            if wt_r is None:
                residue.display_generic_number = None
                residue.generic_number = None
                residue.protein_segment = None
            else:
                if wt_r.generic_number is not None:
                    residue.display_generic_number = wt_r.display_generic_number
                    residue.generic_number = wt_r.generic_number
                else:
                    residue.display_generic_number = None
                    residue.generic_number = None
                residue.protein_segment = wt_r.protein_segment

            residue.save()

            rotamer_data, _ = PdbData.objects.get_or_create(pdb=pdb_block)
            Rotamer.objects.get_or_create(residue=residue, structure=structure, pdbdata=rotamer_data)

        return None
    def handle(self, *args, **options):
        startTime = datetime.datetime.now()
        self.options = options
        if self.options["purge"]:
            Residue.objects.filter(
                protein_conformation__protein__entry_name__endswith="_a",
                protein_conformation__protein__family__parent__parent__name=
                "Alpha").delete()
            ProteinConformation.objects.filter(
                protein__entry_name__endswith="_a",
                protein__family__parent__parent__name="Alpha").delete()
            Protein.objects.filter(
                entry_name__endswith="_a",
                family__parent__parent__name="Alpha").delete()
            SignprotStructureExtraProteins.objects.all().delete()
            SignprotStructure.objects.all().delete()

        if not options["only_signprot_structures"]:
            # Building protein and protconf objects for g protein structure in complex
            if options["s"]:
                scs = SignprotComplex.objects.filter(
                    structure__pdb_code__index__in=[
                        i.upper() for i in options["s"]
                    ])
            else:
                scs = SignprotComplex.objects.all()
            for sc in scs:
                self.logger.info(
                    "Protein, ProteinConformation and Residue build for alpha subunit of {} is building"
                    .format(sc))
                try:
                    # Alpha subunit
                    try:
                        alpha_protein = Protein.objects.get(
                            entry_name=sc.structure.pdb_code.index.lower() +
                            "_a")
                    except:
                        alpha_protein = Protein()
                        alpha_protein.entry_name = sc.structure.pdb_code.index.lower(
                        ) + "_a"
                        alpha_protein.accession = None
                        alpha_protein.name = sc.structure.pdb_code.index.lower(
                        ) + "_a"
                        alpha_protein.sequence = sc.protein.sequence
                        alpha_protein.family = sc.protein.family
                        alpha_protein.parent = sc.protein
                        alpha_protein.residue_numbering_scheme = sc.protein.residue_numbering_scheme
                        alpha_protein.sequence_type = ProteinSequenceType.objects.get(
                            slug="mod")
                        alpha_protein.source = ProteinSource.objects.get(
                            name="OTHER")
                        alpha_protein.species = sc.protein.species
                        alpha_protein.save()

                    try:
                        alpha_protconf = ProteinConformation.objects.get(
                            protein__entry_name=sc.structure.pdb_code.index.
                            lower() + "_a")
                    except:
                        alpha_protconf = ProteinConformation()
                        alpha_protconf.protein = alpha_protein
                        alpha_protconf.state = ProteinState.objects.get(
                            slug="active")
                        alpha_protconf.save()

                    pdbp = PDBParser(PERMISSIVE=True, QUIET=True)
                    s = pdbp.get_structure("struct",
                                           StringIO(sc.structure.pdb_data.pdb))
                    chain = s[0][sc.alpha]
                    nums = []
                    for res in chain:
                        if "CA" in res and res.id[0] == " ":
                            nums.append(res.get_id()[1])

                    resis = Residue.objects.filter(
                        protein_conformation__protein=sc.protein)
                    num_i = 0
                    temp_seq2 = ""
                    pdb_num_dict = OrderedDict()
                    # Create first alignment based on sequence numbers
                    for n in nums:
                        if sc.structure.pdb_code.index == "6OIJ" and n < 30:
                            nr = n + 6
                        else:
                            nr = n
                        pdb_num_dict[n] = [
                            chain[n], resis.get(sequence_number=nr)
                        ]
                    # Find mismatches
                    mismatches = []
                    for n, res in pdb_num_dict.items():
                        if AA[res[0].get_resname()] != res[1].amino_acid:
                            mismatches.append(res)

                    pdb_lines = sc.structure.pdb_data.pdb.split("\n")
                    seqadv = []
                    for l in pdb_lines:
                        if l.startswith("SEQADV"):
                            seqadv.append(l)
                    mutations, shifted_mutations = OrderedDict(), OrderedDict()
                    # Search for annotated engineered mutations in pdb SEQADV
                    for s in seqadv:
                        line_search = re.search(
                            "SEQADV\s{1}[A-Z\s\d]{4}\s{1}([A-Z]{3})\s{1}([A-Z]{1})\s+(\d+)[\s\S\d]{5}([\s\S\d]{12})([A-Z]{3})\s+(\d+)(\s\S+)",
                            s)
                        if line_search != None:
                            if line_search.group(2) == sc.alpha:
                                if line_search.group(
                                        4).strip() == sc.protein.accession:
                                    if line_search.group(
                                            3) == line_search.group(6):
                                        mutations[int(
                                            line_search.group(3))] = [
                                                line_search.group(1),
                                                line_search.group(5)
                                            ]
                                    else:
                                        shifted_mutations[int(
                                            line_search.group(3))] = [
                                                line_search.group(1),
                                                line_search.group(5),
                                                int(line_search.group(6))
                                            ]
                                else:
                                    # Exception for 6G79
                                    if line_search.group(
                                            3
                                    ) != line_search.group(
                                            6
                                    ) and "CONFLICT" in line_search.group(7):
                                        mutations[int(
                                            line_search.group(3))] = [
                                                line_search.group(1),
                                                line_search.group(5)
                                            ]
                                    # Exception for 5G53
                                    if line_search.group(
                                            4).strip() != sc.protein.accession:
                                        mutations[int(
                                            line_search.group(3))] = [
                                                line_search.group(1),
                                                line_search.group(5)
                                            ]
                    remaining_mismatches = []

                    # Check and clear mismatches that are registered in pdb SEQADV as engineered mutation
                    for m in mismatches:
                        num = m[0].get_id()[1]
                        if num in mutations:
                            if m[0].get_resname() != mutations[num][0] and m[
                                    1].amino_acid != AA[mutations[num][1]]:
                                remaining_mismatches.append(m)
                        elif num in shifted_mutations:
                            remaining_mismatches.append(m)
                        else:
                            remaining_mismatches.append(m)

                    if options["debug"]:
                        print(sc)
                        print(mutations)
                        print(shifted_mutations)
                        print(mismatches)
                        print("======")
                        print(remaining_mismatches)
                        pprint.pprint(pdb_num_dict)

                    no_seqnum_shift = [
                        '6OY9', '6OYA', '6LPB', '6WHA', '7D77', '6XOX', '7L1U',
                        '7L1V'
                    ]

                    # Check if HN is mutated to GNAI1 for the scFv16 stabilizer
                    if sc.protein.entry_name != 'gnai1_human' and len(
                            remaining_mismatches) > 0:
                        target_HN = resis.filter(protein_segment__slug='HN')
                        gnai1_HN = Residue.objects.filter(
                            protein_conformation__protein__entry_name=
                            'gnai1_human',
                            protein_segment__slug='HN')
                        pdb_HN_seq = ''
                        for num, val in pdb_num_dict.items():
                            if num <= target_HN.reverse()[0].sequence_number:
                                pdb_HN_seq += Polypeptide.three_to_one(
                                    val[0].get_resname())
                        if options['debug']:
                            print('Checking if HN is gnai1_human')
                            print(pdb_HN_seq)
                            print(''.join(
                                gnai1_HN.values_list('amino_acid', flat=True)))
                        gnai1_HN_seq = ''.join(
                            gnai1_HN.values_list('amino_acid', flat=True))
                        pw2 = pairwise2.align.localms(gnai1_HN_seq, pdb_HN_seq,
                                                      3, -4, -3, -1)
                        ref_seq, temp_seq = str(pw2[0][0]), str(pw2[0][1])
                        length, match = 0, 0
                        for r, t in zip(ref_seq, temp_seq):
                            if options['debug']:
                                print(r, t)
                            if t != '-':
                                if r == t:
                                    match += 1
                                length += 1
                        identity = match / length * 100
                        if options['debug']:
                            print(identity)
                        if identity > 85:
                            if sc.structure.pdb_code.index not in ['7DFL']:
                                no_seqnum_shift.append(
                                    sc.structure.pdb_code.index)
                            if options['debug']:
                                print(
                                    'INFO: HN has {}% with gnai1_human HN, skipping seqnum shift correction'
                                    .format(round(identity)))

                    # Mismatches remained possibly to seqnumber shift, making pairwise alignment to try and fix alignment
                    if len(
                            remaining_mismatches
                    ) > 0 and sc.structure.pdb_code.index not in no_seqnum_shift:
                        ppb = PPBuilder()
                        seq = ""
                        for pp in ppb.build_peptides(chain, aa_only=False):
                            seq += str(pp.get_sequence())
                        if sc.structure.pdb_code.index in [
                                '7JVQ', '7L1U', '7L1V'
                        ]:
                            pw2 = pairwise2.align.localms(
                                sc.protein.sequence, seq, 3, -4, -3, -1)
                        else:
                            pw2 = pairwise2.align.localms(
                                sc.protein.sequence, seq, 2, -1, -.5, -.1)
                        ref_seq, temp_seq = str(pw2[0][0]), str(pw2[0][1])

                        # Custom fix for A->G mutation at pos 18
                        if sc.structure.pdb_code.index == '7JJO':
                            ref_seq = ref_seq[:18] + ref_seq[19:]
                            temp_seq = temp_seq[:17] + temp_seq[18:]
                        # Custom alignment fixes
                        elif sc.structure.pdb_code.index == '7DFL':
                            ref_seq = 'MTLESIMACCLSEEAKEARRINDEIERQLRRDKRDARRELKLLLLGTGESGKSTFIKQMRIIHGSGYSDEDKRGFTKLVYQNIFTAMQAMIRAMDTLKIPYKYEHNKAHAQLVREVDVEKVSAFENPYVDAIKSLWNDPGIQECYDRRREYQLSDSTKYYLNDLDRVADPAYLPTQQDVLRVRVPTTGIIEYPFDLQSVIFRMVDVGGQRSERRKWIHCFENVTSIMFLVALSEYDQVLVESDNENRMEESKALFRTIITYPWFQNSSVILFLNKKDLLEEKIMYSHLVDYFPEYDGPQRDAQAAREFILKMFVDLNPDSDKIIYSHFTCATDTENIRFVFAAVKDTILQLNLKEYNLV'
                            temp_seq = '--------CTLSAEDKAAVERSKMIDRNLREDGEKARRELKLLLLGTGESGKSTFIKQMRIIHG--------------------------------------------------------------------------------------------------------------------------TGIIEYPFDLQSVIFRMVDVGGQRSERRKWIHCFENVTSIMFLVALSEYDQV----DNENRMEESKALFRTIITYPWFQNSSVILFLNKKDLLEEKIMYSHLVDYFPEYDGPQRDAQAAREFILKMFVDLNPDSDKILYSHFTCATDTENIRFVFAAVKDTILQLNLKEYNLV'
                        elif sc.structure.pdb_code.index == '7JOZ':
                            temp_seq = temp_seq[:67] + (
                                '-' * 14) + 'FNGDS' + temp_seq[86:]
                        elif sc.structure.pdb_code.index == '7AUE':
                            ref_seq = ref_seq[:31].replace('-',
                                                           '') + ref_seq[31:]
                            temp_seq = (
                                9 *
                                '-') + temp_seq[2:5] + temp_seq[5:54].replace(
                                    '-', '') + temp_seq[54:]
                        wt_pdb_dict = OrderedDict()
                        pdb_wt_dict = OrderedDict()
                        j, k = 0, 0
                        for i, ref, temp in zip(range(0, len(ref_seq)),
                                                ref_seq, temp_seq):
                            if options["debug"]:
                                print(i, ref, temp)  # alignment check
                            if ref != "-" and temp != "-":
                                wt_pdb_dict[resis[j]] = pdb_num_dict[nums[k]]
                                pdb_wt_dict[pdb_num_dict[nums[k]]
                                            [0]] = resis[j]
                                j += 1
                                k += 1
                            elif ref == "-":
                                wt_pdb_dict[i] = pdb_num_dict[nums[k]]
                                pdb_wt_dict[pdb_num_dict[nums[k]][0]] = i
                                k += 1
                            elif temp == "-":
                                wt_pdb_dict[resis[j]] = i
                                pdb_wt_dict[i] = resis[j]
                                j += 1
                        # Custom fix for 7JJO isoform difference
                        if sc.structure.pdb_code.index in [
                                '7JJO', '7JOZ', '7AUE'
                        ]:
                            pdb_num_dict = OrderedDict()
                            for wt_res, st_res in wt_pdb_dict.items():
                                if type(st_res) == type([]):
                                    pdb_num_dict[wt_res.sequence_number] = [
                                        st_res[0], wt_res
                                    ]
                        else:
                            for i, r in enumerate(remaining_mismatches):
                                # Adjust for shifted residue when residue is a match
                                if r[0].get_id()[1] - remaining_mismatches[
                                        i - 1][0].get_id()[1] > 1:
                                    pdb_num_dict[r[0].get_id()[1] -
                                                 1][1] = pdb_wt_dict[chain[
                                                     r[0].get_id()[1] - 1]]
                                # Adjust for shifted residue when residue is mutated and it's logged in SEQADV
                                if r[0].get_id()[1] in shifted_mutations:
                                    pdb_num_dict[
                                        r[0].get_id()[1]][1] = resis.get(
                                            sequence_number=shifted_mutations[
                                                r[0].get_id()[1]][2])
                                # Adjust for shift
                                else:
                                    pdb_num_dict[r[0].get_id()
                                                 [1]][1] = pdb_wt_dict[r[0]]
                            if sc.structure.pdb_code.index == '7JVQ':
                                pdb_num_dict[198][1] = Residue.objects.get(
                                    protein_conformation__protein=sc.protein,
                                    sequence_number=346)
                                pdb_num_dict[235][1] = Residue.objects.get(
                                    protein_conformation__protein=sc.protein,
                                    sequence_number=383)
                            elif sc.structure.pdb_code.index == '6PB0':
                                pdb_num_dict[205][1] = Residue.objects.get(
                                    protein_conformation__protein=sc.protein,
                                    sequence_number=205)
                    ### Custom alignment fix for 6WHA mini-Gq/Gi2/Gs chimera
                    elif sc.structure.pdb_code.index == "6WHA":
                        ref_seq = "MTLESIMACCLSEEAKEARRINDEIERQLRRDKRDARRELKLLLLGTGESGKSTFIKQMRIIHGSGYSDEDKRGFTKLVYQNIFTAMQAMIRAMDTLKIPYKYEHNKAHAQLVREVDVEKVSAFENPYVDAIKSLWNDPGIQECYDRRREYQLSDSTKYYLNDLDRVADPAYLPTQQDVLRVRVPTTGIIEYPFDLQSVIFRMVDVGGQRSERRKWIHCFENVTSIMFLVALSEYDQVLVESDNENRMEESKALFRTIITYPWFQNSSVILFLNKKDLLEEKIM--YSHLVDYFPEYDGP----QRDAQAAREFILKMFVDL---NPDSDKIIYSHFTCATDTENIRFVFAAVKDTILQLNLKEYNLV"
                        temp_seq = "----------VSAEDKAAAERSKMIDKNLREDGEKARRTLRLLLLGADNSGKSTIVK----------------------------------------------------------------------------------------------------------------------------------GIFETKFQVDKVNFHMFDVG-----RRKWIQCFNDVTAIIFVVDSSDYNR----------LQEALNDFKSIWNNRWLRTISVILFLNKQDLLAEKVLAGKSKIEDYFPEFARYTTPDPRVTRAKY-FIRKEFVDISTASGDGRHICYPHFTC-VDTENARRIFNDCKDIILQMNLREYNLV"
                        pdb_num_dict = OrderedDict()
                        temp_resis = [res for res in chain]
                        temp_i = 0
                        mapped_cgns = []
                        for i, aa in enumerate(temp_seq):
                            if aa != "-":
                                ref_split_on_gaps = ref_seq[:i + 1].split("-")
                                ref_seqnum = i - (len(ref_split_on_gaps) -
                                                  1) + 1
                                res = resis.get(sequence_number=ref_seqnum)
                                if res.display_generic_number.label in mapped_cgns:
                                    next_presumed_cgn = self.get_next_presumed_cgn(
                                        res)
                                    if next_presumed_cgn:
                                        res = next_presumed_cgn
                                        while res and res.display_generic_number.label in mapped_cgns:
                                            res = self.get_next_presumed_cgn(
                                                res)
                                    else:
                                        print(
                                            "Error: {} CGN does not exist. Incorrect mapping of {} in {}"
                                            .format(next_presumed_cgn,
                                                    chain[nums[temp_i]],
                                                    sc.structure))
                                mapped_cgns.append(
                                    res.display_generic_number.label)
                                pdb_num_dict[nums[temp_i]] = [
                                    chain[nums[temp_i]], res
                                ]
                                temp_i += 1

                    bulked_rotamers = []
                    for key, val in pdb_num_dict.items():
                        # print(key, val) # sanity check
                        if not isinstance(val[1], int):
                            res_obj = Residue()
                            res_obj.sequence_number = val[0].get_id()[1]
                            res_obj.amino_acid = AA[val[0].get_resname()]
                            res_obj.display_generic_number = val[
                                1].display_generic_number
                            res_obj.generic_number = val[1].generic_number
                            res_obj.protein_conformation = alpha_protconf
                            res_obj.protein_segment = val[1].protein_segment
                            res_obj.save()
                            rot = self.create_structure_rotamer(
                                val[0], res_obj, sc.structure)
                            bulked_rotamers.append(rot)
                        else:
                            self.logger.info(
                                "Skipped {} as no annotation was present, while building for alpha subunit of {}"
                                .format(val[1], sc))
                    if options["debug"]:
                        pprint.pprint(pdb_num_dict)
                    Rotamer.objects.bulk_create(bulked_rotamers)
                    self.logger.info(
                        "Protein, ProteinConformation and Residue build for alpha subunit of {} is finished"
                        .format(sc))
                except Exception as msg:
                    if options["debug"]:
                        print("Error: ", sc, msg)
                    self.logger.info(
                        "Protein, ProteinConformation and Residue build for alpha subunit of {} has failed"
                        .format(sc))

        if not options["s"]:
            ### Build SignprotStructure objects from non-complex signprots
            g_prot_alphas = Protein.objects.filter(
                family__slug__startswith="100_001",
                accession__isnull=False)  #.filter(entry_name="gnai1_human")
            complex_structures = SignprotComplex.objects.all().values_list(
                "structure__pdb_code__index", flat=True)
            for a in g_prot_alphas:
                pdb_list = get_pdb_ids(a.accession)
                for pdb in pdb_list:
                    if pdb not in complex_structures:
                        try:
                            data = self.fetch_gprot_data(pdb, a)
                            if data:
                                self.build_g_prot_struct(a, pdb, data)
                        except Exception as msg:
                            self.logger.error(
                                "SignprotStructure of {} {} failed\n{}: {}".
                                format(a.entry_name, pdb, type(msg), msg))

        if options["debug"]:
            print(datetime.datetime.now() - startTime)
Пример #9
0
    def add_cgn_residues(self, gprotein_list):
        """Bulk-create ``Residue`` rows for the listed G proteins from the CGN table.

        Reads the tab-separated file at ``self.gprotein_data_file``, keeps the
        rows whose ``Uniprot_ACC`` is in ``gprotein_list`` and, for each
        remaining row, resolves (get-or-create) the protein, its protein
        conformation, the generic number and the protein segment before
        queuing a ``Residue``. Queued residues are written with
        ``bulk_create`` in batches of 10000, with a final flush after the loop.
        A ``ResidueGenericNumberEquivalent`` in the ``cgn`` scheme is ensured
        once per generic-number label.

        Args:
            gprotein_list: UniProt accessions to keep (handed to
                ``Series.isin``).
        """
        batch_size = 10000

        # Parsing pdb uniprot file for residues
        self.logger.info('Start parsing PDB_UNIPROT_ENSEMBLE_ALL')
        self.logger.info('Parsing file ' + self.gprotein_data_file)
        residue_data = pd.read_table(self.gprotein_data_file, sep="\t", low_memory=False)
        residue_data = residue_data.loc[residue_data['Uniprot_ACC'].isin(gprotein_list)]
        cgn_scheme = ResidueNumberingScheme.objects.get(slug='cgn')

        # In-memory caches so every protein / generic number / segment /
        # equivalent is resolved against the database only once.
        proteins = {}
        generic_numbers = {}
        segments = {}
        equivalents = set()
        bulk = []

        # Bound up front so the closing log line also works when no row of the
        # table matches gprotein_list (the loop body then never runs).
        index = None

        self.logger.info('Insert residues: {} rows'.format(len(residue_data)))
        for index, row in residue_data.iterrows():
            accession = row['Uniprot_ACC']
            if accession in proteins:
                pr, pc = proteins[accession]
            else:
                # fetch protein for protein conformation
                pr, _ = Protein.objects.get_or_create(accession=accession)
                # fetch protein conformation
                pc, _ = ProteinConformation.objects.get_or_create(protein_id=pr)
                proteins[accession] = [pr, pc]

            # The generic-number label is the CGN value, with the third
            # dot-separated field zero-padded when its integer value is < 10.
            cgn_parts = row['CGN'].split('.')
            if int(cgn_parts[2]) < 10:
                label = cgn_parts[0] + '.' + cgn_parts[1] + '.0' + cgn_parts[2]
            else:
                label = row['CGN']

            # fetch residue generic number
            if label in generic_numbers:
                rgn = generic_numbers[label]
            else:
                rgn, _ = ResidueGenericNumber.objects.get_or_create(label=label)
                generic_numbers[label] = rgn

            # fetch protein segment id (second field of the CGN value)
            segment_slug = cgn_parts[1]
            if segment_slug in segments:
                ps = segments[segment_slug]
            else:
                ps, _ = ProteinSegment.objects.get_or_create(slug=segment_slug, proteinfamily='Gprotein')
                segments[segment_slug] = ps

            try:
                bulk.append(Residue(
                    sequence_number=row['Position'],
                    protein_conformation=pc,
                    amino_acid=row['Residue'],
                    generic_number=rgn,
                    display_generic_number=rgn,
                    protein_segment=ps))
            except Exception:
                self.logger.error("Failed to add residues")

            # Flush a full batch; an empty list is never flushed.
            if len(bulk) >= batch_size:
                self.logger.info('Inserted bulk {} (Index:{})'.format(len(bulk), index))
                Residue.objects.bulk_create(bulk)
                bulk = []

            # Add also to the ResidueGenericNumberEquivalent table needed for single residue selection
            try:
                if rgn.label not in equivalents:
                    ResidueGenericNumberEquivalent.objects.get_or_create(
                        label=rgn.label, default_generic_number=rgn, scheme=cgn_scheme)
                    equivalents.add(rgn.label)
            except Exception:
                self.logger.error("Failed to add residues to ResidueGenericNumberEquivalent")

        # Write whatever is left over from the last, partial batch.
        self.logger.info('Inserted bulk {} (Index:{})'.format(len(bulk), index))
        Residue.objects.bulk_create(bulk)
Пример #10
0
    def main_func(self, positions, iteration):
        """Build construct proteins from a slice of the YAML construct files.

        ``positions`` is a ``(start, end)`` pair selecting
        ``self.filenames[start:end]``; a falsy ``end`` means "up to the end of
        the list". ``iteration`` is accepted but not used by this method.

        For every regular, non-dot file under ``self.construct_data_dir`` the
        YAML document is loaded and used to:

        * look up the parent ``ProteinConformation`` by ``sd['protein']`` in
          the default protein state (the file is skipped when it is missing),
        * create a new ``Protein`` (sequence type ``mod``, source ``OTHER``)
          and a ``ProteinConformation`` for the construct,
        * register fusion proteins for ``sd['insertions']``,
        * copy the parent's residues, skipping ``sd['deletions']`` and
          applying ``sd['mutations']``, and
        * store the resulting sequence on the new protein.
        """
        # filenames
        if not positions[1]:
            filenames = self.filenames[positions[0]:]
        else:
            filenames = self.filenames[positions[0]:positions[1]]

        # parse files
        for source_file in filenames:
            source_file_path = os.sep.join([self.construct_data_dir, source_file])
            if not os.path.isfile(source_file_path) or source_file[0] == '.':
                continue

            self.logger.info('Reading file {}'.format(source_file_path))
            # read the yaml file; the handle is only needed while parsing
            with open(source_file_path, 'r') as f:
                # NOTE(review): yaml.load() without an explicit Loader can
                # construct arbitrary Python objects and newer PyYAML releases
                # require the Loader argument - consider yaml.safe_load(f) if
                # the construct files are plain data.
                sd = yaml.load(f)

            # is a protein specified? (an empty document loads as None)
            if not sd or 'protein' not in sd:
                self.logger.error('Protein not specified for construct, skipping')
                continue

            # fetch the parent protein
            try:
                ppc = ProteinConformation.objects.prefetch_related('protein__family', 'protein__species',
                    'protein__residue_numbering_scheme').get(protein__entry_name=sd['protein'],
                    state__slug=settings.DEFAULT_PROTEIN_STATE)
            except ProteinConformation.DoesNotExist:
                # abort if parent protein is not found
                self.logger.error('Parent protein {} for construct {} not found, aborting!'.format(
                    sd['protein'], sd['name']))
                continue

            # sequence type
            try:
                sequence_type, created = ProteinSequenceType.objects.get_or_create(slug='mod',
                    defaults={'name': 'Modified'})
                if created:
                    self.logger.info('Created sequence type {}'.format(sequence_type))
            except IntegrityError:
                sequence_type = ProteinSequenceType.objects.get(slug='mod')

            # protein source
            try:
                protein_source, created = ProteinSource.objects.get_or_create(name='OTHER')
                if created:
                    self.logger.info('Created protein source {}'.format(protein_source))
            except IntegrityError:
                protein_source = ProteinSource.objects.get(name='OTHER')

            # create a protein record
            p = Protein()
            p.parent = ppc.protein
            p.family = ppc.protein.family
            p.species = ppc.protein.species
            p.residue_numbering_scheme = ppc.protein.residue_numbering_scheme
            p.sequence_type = sequence_type
            p.source = protein_source
            p.entry_name = slugify(strip_tags(sd['name']))
            p.name = sd['name']
            p.sequence = ppc.protein.sequence

            # save protein (construct)
            try:
                p.save()
                self.logger.info('Created construct {} with parent protein {}'.format(p.name,
                    ppc.protein.entry_name))
            except Exception:
                self.logger.error('Failed creating construct {} with parent protein {}'.format(p.name,
                    ppc.protein.entry_name))
                continue

            # create protein conformation record
            pc = ProteinConformation()
            pc.protein = p
            pc.state = ProteinState.objects.get(slug=settings.DEFAULT_PROTEIN_STATE)
            try:
                pc.save()
                self.logger.info('Created conformation {} of protein {}'.format(pc.state.name, p.name))
            except Exception:
                # NOTE(review): processing continues after this failure, as in
                # the original code - confirm whether the file should be skipped.
                self.logger.error('Failed creating conformation {} of protein {}'.format(pc.state.name,
                    p.entry_name))

            # deletions: each entry is an inclusive [first, last] range
            deletions = []
            if 'deletions' in sd and sd['deletions']:
                for t in sd['deletions']:
                    deletions += list(range(t[0], t[1] + 1))

            # mutations: strings of the form <wild-type><number><mutant>
            mutations = {}
            if 'mutations' in sd and sd['mutations']:
                for m in sd['mutations']:
                    res_num = int(m[1:-1])
                    mutations[res_num] = {
                        'wt_res': m[0],
                        'mut_res': m[-1],
                        'full': m,
                    }

            # insertions
            split_segments = {}
            if 'insertions' in sd and sd['insertions']:
                for ins in sd['insertions']:
                    ins_start = Residue.objects.get(protein_conformation=ppc,
                        sequence_number=ins['positions'][0])
                    ins_end = Residue.objects.get(protein_conformation=ppc,
                        sequence_number=ins['positions'][1])
                    # if the insertion is within only one segment (the usual case), split that
                    # segment into two segments
                    if ins_start and ins_start.protein_segment == ins_end.protein_segment:
                        # get/create split protein segments
                        slug_1 = ins_start.protein_segment.slug + "_1"
                        try:
                            segment_before, created = ProteinSegment.objects.get_or_create(slug=slug_1,
                                defaults={'name': ins_start.protein_segment.name,
                                'category': ins_start.protein_segment.category, 'partial': True})
                            if created:
                                self.logger.info('Created protein segment {}'.format(segment_before))
                        except IntegrityError:
                            segment_before = ProteinSegment.objects.get(slug=slug_1)

                        slug_2 = ins_start.protein_segment.slug + "_2"
                        try:
                            segment_after, created = ProteinSegment.objects.get_or_create(slug=slug_2,
                                defaults={'name': ins_start.protein_segment.name,
                                'category': ins_start.protein_segment.category, 'partial': True})
                            if created:
                                self.logger.info('Created protein segment {}'.format(segment_after))
                        except IntegrityError:
                            segment_after = ProteinSegment.objects.get(slug=slug_2)

                        # keep track of information about split segments
                        split_segments[ins_start.protein_segment.slug] = {
                            'start': {
                                'sequence_number': ins['positions'][0],
                                'segment': segment_before,
                            },
                            'end': {
                                'sequence_number': ins['positions'][1],
                                'segment': segment_after,
                            },
                        }
                    # if the insertion covers two segments, use those two as the segments before and after
                    elif ins_start:
                        segment_before = ins_start.protein_segment
                        segment_after = ins_end.protein_segment

                    # if the insertion replaces a part of the sequence, add that range as a deletion
                    if ins['positions'][1] > (ins['positions'][0] + 1):
                        deletions += list(range((ins['positions'][0] + 1), ins['positions'][1]))

                    # get/insert fusion protein
                    fusion, _ = ProteinFusion.objects.get_or_create(name=ins['name'], defaults={
                        'sequence': ins['sequence']})

                    # create relationship with protein
                    ProteinFusionProtein.objects.create(protein=p, protein_fusion=fusion,
                        segment_before=segment_before, segment_after=segment_after)

            # create residue records by copying the parent's residues
            prs = Residue.objects.filter(protein_conformation=ppc).prefetch_related(
                'protein_conformation__protein', 'protein_segment', 'generic_number',
                'display_generic_number__scheme', 'alternative_generic_numbers__scheme')
            updated_sequence = ''
            for pr in prs:
                if pr.sequence_number in deletions:
                    continue

                r = Residue()
                r.protein_conformation = pc
                r.generic_number = pr.generic_number
                r.display_generic_number = pr.display_generic_number
                r.sequence_number = pr.sequence_number

                # check for split segments; numbers strictly between the two
                # anchors of an insertion were added to deletions above
                if pr.protein_segment.slug in split_segments:
                    split = split_segments[pr.protein_segment.slug]
                    rsns = split['start']['sequence_number']
                    rsne = split['end']['sequence_number']
                    if r.sequence_number <= rsns:
                        r.protein_segment = split['start']['segment']
                    elif r.sequence_number >= rsne:
                        r.protein_segment = split['end']['segment']
                else:
                    r.protein_segment = pr.protein_segment

                # amino acid, check for mutations
                if r.sequence_number in mutations:
                    mutation = mutations[r.sequence_number]
                    if mutation['wt_res'] == pr.amino_acid:
                        r.amino_acid = mutation['mut_res']
                    else:
                        # NOTE(review): r.amino_acid is left at the model's
                        # default here, as in the original code - confirm
                        # whether the wild-type residue should be used instead.
                        self.logger.error(
                            'Mutation {} in construct {} does not match wild-type sequence of {}'.format(
                                mutation['full'], pc.protein.name, ppc.protein.entry_name))
                else:
                    r.amino_acid = pr.amino_acid

                # save amino acid to updated sequence
                updated_sequence += r.amino_acid

                # save residue before populating M2M relations
                r.save()

                # alternative generic numbers
                for agn in pr.alternative_generic_numbers.all():
                    r.alternative_generic_numbers.add(agn)

            # update sequence
            p.sequence = updated_sequence
            p.save()
Пример #11
0
    def main_func(self, positions, iteration):
        """Build construct proteins from the YAML files assigned to this call.

        For every YAML file in the selected slice of ``self.filenames`` this
        creates a ``Protein`` (the construct) derived from its parent protein,
        a ``ProteinConformation`` in the default protein state, the fusion
        protein links described under ``insertions``, and a copy of every
        parent ``Residue`` that is not deleted, with ``mutations`` applied.
        The construct's sequence is finally rebuilt from the residues kept.

        Args:
            positions: Two-item sequence ``(start, end)`` used to slice
                ``self.filenames``; a falsy ``end`` means "up to the end".
            iteration: Not used by this method.
        """
        # filenames
        if not positions[1]:
            filenames = self.filenames[positions[0]:]
        else:
            filenames = self.filenames[positions[0]:positions[1]]

        # parse files
        for source_file in filenames:
            source_file_path = os.sep.join(
                [self.construct_data_dir, source_file])
            # skip anything that is not a regular file, and hidden files
            if not os.path.isfile(source_file_path) or source_file[0] == '.':
                continue

            self.logger.info('Reading file {}'.format(source_file_path))
            # read the yaml file; safe_load only builds plain Python data and,
            # unlike a bare yaml.load(f), still works on PyYAML >= 6 where the
            # Loader argument is mandatory
            with open(source_file_path, 'r') as f:
                sd = yaml.safe_load(f)

            # is a protein specified? (an empty file parses to None)
            if not sd or 'protein' not in sd:
                self.logger.error(
                    'Protein not specified for construct, skipping')
                continue

            # fetch the parent protein
            try:
                ppc = ProteinConformation.objects.prefetch_related(
                    'protein__family', 'protein__species',
                    'protein__residue_numbering_scheme').get(
                        protein__entry_name=sd['protein'],
                        state__slug=settings.DEFAULT_PROTEIN_STATE)
            except ProteinConformation.DoesNotExist:
                # skip this construct if the parent protein is not found
                self.logger.error(
                    'Parent protein {} for construct {} not found, aborting!'
                    .format(sd['protein'], sd['name']))
                continue

            # sequence type
            try:
                sequence_type, created = ProteinSequenceType.objects.get_or_create(
                    slug='mod', defaults={'name': 'Modified'})
                if created:
                    self.logger.info(
                        'Created sequence type {}'.format(sequence_type))
            except IntegrityError:
                sequence_type = ProteinSequenceType.objects.get(slug='mod')

            # protein source
            try:
                protein_source, created = ProteinSource.objects.get_or_create(
                    name='OTHER')
                if created:
                    self.logger.info(
                        'Created protein source {}'.format(protein_source))
            except IntegrityError:
                protein_source = ProteinSource.objects.get(name='OTHER')

            # create a protein record
            p = Protein()
            p.parent = ppc.protein
            p.family = ppc.protein.family
            p.species = ppc.protein.species
            p.residue_numbering_scheme = ppc.protein.residue_numbering_scheme
            p.sequence_type = sequence_type
            p.source = protein_source
            p.entry_name = slugify(strip_tags(sd['name']))
            p.name = sd['name']
            p.sequence = ppc.protein.sequence

            # save protein (construct)
            try:
                p.save()
                self.logger.info(
                    'Created construct {} with parent protein {}'.format(
                        p.name, ppc.protein.entry_name))
            except Exception:
                self.logger.error(
                    'Failed creating construct {} with parent protein {}'
                    .format(p.name, ppc.protein.entry_name))
                continue

            # create protein conformation record
            pc = ProteinConformation()
            pc.protein = p
            pc.state = ProteinState.objects.get(
                slug=settings.DEFAULT_PROTEIN_STATE)
            try:
                pc.save()
                self.logger.info(
                    'Created conformation {} of protein {}'.format(
                        pc.state.name, p.name))
            except Exception:
                self.logger.error(
                    'Failed creating conformation {} of protein {}'.format(
                        pc.state.name, p.entry_name))
                # residues cannot be attached to an unsaved conformation
                continue

            # sequence numbers of parent residues that must not be copied;
            # a set, because it is probed once per parent residue below
            deletions = set()
            for t in sd.get('deletions') or ():
                deletions.update(range(t[0], t[1] + 1))

            # mutations are written like "A123W": wild-type residue, sequence
            # number, mutant residue
            mutations = {}
            for m in sd.get('mutations') or ():
                res_num = int(m[1:-1])
                mutations[res_num] = {
                    'wt_res': m[0],
                    'mut_res': m[-1],
                    'full': m,
                }

            # insertions
            split_segments = {}
            for ins in sd.get('insertions') or ():
                ins_first = ins['positions'][0]
                ins_last = ins['positions'][1]
                ins_start = Residue.objects.get(
                    protein_conformation=ppc, sequence_number=ins_first)
                ins_end = Residue.objects.get(
                    protein_conformation=ppc, sequence_number=ins_last)

                if ins_start.protein_segment == ins_end.protein_segment:
                    # the insertion is within only one segment (the usual
                    # case): split that segment into two partial segments
                    parent_segment = ins_start.protein_segment
                    segment_before = self._get_or_create_split_segment(
                        parent_segment, '_1')
                    segment_after = self._get_or_create_split_segment(
                        parent_segment, '_2')

                    # keep track of information about split segments
                    split_segments[parent_segment.slug] = {
                        'start': {
                            'sequence_number': ins_first,
                            'segment': segment_before,
                        },
                        'end': {
                            'sequence_number': ins_last,
                            'segment': segment_after,
                        },
                    }
                else:
                    # the insertion covers two segments: use those two as the
                    # segments before and after
                    segment_before = ins_start.protein_segment
                    segment_after = ins_end.protein_segment

                # if the insertion replaces a part of the sequence, add that
                # range as a deletion
                if ins_last > ins_first + 1:
                    deletions.update(range(ins_first + 1, ins_last))

                # get/insert fusion protein
                fusion, _ = ProteinFusion.objects.get_or_create(
                    name=ins['name'],
                    defaults={'sequence': ins['sequence']})

                # create relationship with protein
                ProteinFusionProtein.objects.create(
                    protein=p,
                    protein_fusion=fusion,
                    segment_before=segment_before,
                    segment_after=segment_after)

            # copy the parent residues onto the construct
            prs = Residue.objects.filter(
                protein_conformation=ppc).prefetch_related(
                    'protein_conformation__protein', 'protein_segment',
                    'generic_number', 'display_generic_number__scheme',
                    'alternative_generic_numbers__scheme')
            sequence_parts = []
            for pr in prs:
                if pr.sequence_number in deletions:
                    continue

                r = Residue()
                r.protein_conformation = pc
                r.generic_number = pr.generic_number
                r.display_generic_number = pr.display_generic_number
                r.sequence_number = pr.sequence_number

                # check for split segments
                if pr.protein_segment.slug in split_segments:
                    split = split_segments[pr.protein_segment.slug]
                    if r.sequence_number <= split['start']['sequence_number']:
                        r.protein_segment = split['start']['segment']
                    elif r.sequence_number >= split['end']['sequence_number']:
                        r.protein_segment = split['end']['segment']
                else:
                    r.protein_segment = pr.protein_segment

                # amino acid, check for mutations
                if r.sequence_number in mutations:
                    mutation = mutations[r.sequence_number]
                    if mutation['wt_res'] == pr.amino_acid:
                        r.amino_acid = mutation['mut_res']
                    else:
                        # NOTE(review): r.amino_acid is left at the model
                        # default in this branch - confirm that is intended
                        self.logger.error(
                            'Mutation {} in construct {} does not match'
                            ' wild-type sequence of {}'.format(
                                mutation['full'], pc.protein.name,
                                ppc.protein.entry_name))
                else:
                    r.amino_acid = pr.amino_acid

                # save amino acid to updated sequence
                sequence_parts.append(r.amino_acid)

                # save residue before populating M2M relations
                r.save()

                # alternative generic numbers
                for agn in pr.alternative_generic_numbers.all():
                    r.alternative_generic_numbers.add(agn)

            # update sequence
            p.sequence = ''.join(sequence_parts)
            p.save()

    def _get_or_create_split_segment(self, parent_segment, suffix):
        """Return the partial segment with slug ``parent slug + suffix``.

        The segment is created with the parent's name and category and
        ``partial=True`` when it does not exist yet.
        """
        slug = parent_segment.slug + suffix
        try:
            segment, created = ProteinSegment.objects.get_or_create(
                slug=slug,
                defaults={
                    'name': parent_segment.name,
                    'category': parent_segment.category,
                    'partial': True,
                })
            if created:
                self.logger.info(
                    'Created protein segment {}'.format(segment))
        except IntegrityError:
            # presumably the row was inserted concurrently; fetch it instead
            segment = ProteinSegment.objects.get(slug=slug)
        return segment