Python Residue примеры использования

Язык программирования: Python

Пространство имен/Пакет: residue.models

Класс/Тип: Residue

Примеров на hotexamples.com: 11

Python Residue — найдено 11 примеров. Это лучшие примеры кода на Python для residue.models.Residue, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество подборки.

Основные методы

Показать Скрыть

Residue(10)

amino_acid(8)

display_generic_number(8)

generic_number(8)

protein_conformation(8)

protein_segment(8)

sequence_number(8)

save(6)

Пример #1

Показать файл

def _get_or_create_generic_number(schemes, scheme_slug, label, defaults,
                                  scheme=None):
    """Return the ResidueGenericNumber for ``label``, memoised in ``schemes``.

    ``schemes[scheme_slug]['generic_numbers']`` is consulted first. On a miss
    the object is fetched or created through the ORM and stored back into that
    mapping before being returned.

    Args:
        schemes: Mapping of scheme slug to a dict that holds a
            ``'generic_numbers'`` mapping of label -> ResidueGenericNumber.
            Mutated in place on a cache miss.
        scheme_slug: Slug selecting which entry of ``schemes`` to use.
        label: Generic number label to look up.
        defaults: ``defaults`` dict forwarded to ``get_or_create``.
        scheme: ResidueNumberingScheme instance to query with. When ``None``
            it is loaded by ``scheme_slug``, but only on a cache miss.

    Returns:
        The cached, fetched or newly created ResidueGenericNumber.
    """
    cache = schemes[scheme_slug]['generic_numbers']
    if label in cache:
        return cache[label]

    if scheme is None:
        scheme = ResidueNumberingScheme.objects.get(slug=scheme_slug)
    try:
        generic_number, _created = ResidueGenericNumber.objects.get_or_create(
            scheme=scheme, label=label, defaults=defaults)
    except IntegrityError:
        # NOTE(review): presumably the row was inserted concurrently by
        # another writer; fall back to a plain lookup - confirm.
        generic_number = ResidueGenericNumber.objects.get(
            scheme=scheme, label=label)
    cache[label] = generic_number
    return generic_number


def create_or_update_residue(protein_conformation, segment, schemes, residue,
                             b_and_c):
    """Build an unsaved Residue plus its alternative generic numbers.

    Despite the name, the Residue is only instantiated here, never saved; the
    caller receives it together with the list of alternative generic numbers
    (presumably to bulk-create and attach them afterwards - TODO confirm
    against the caller). Generic number rows themselves are fetched or created
    in the database as a side effect, and ``schemes`` is updated in place with
    every generic number that had to be looked up.

    Args:
        protein_conformation: Conformation the residue belongs to; its
            ``protein.residue_numbering_scheme`` is used for the equivalent
            and display generic numbers.
        segment: Protein segment assigned to the residue and used as the
            default ``protein_segment`` for newly created generic numbers.
        schemes: Mapping of scheme slug to a dict with a ``'generic_numbers'``
            label -> ResidueGenericNumber cache.
        residue: Dict with the keys ``'aa'``, ``'pos'`` and ``'numbers'``.
        b_and_c: Passed through unchanged to ``format_generic_numbers``.

    Returns:
        A two-item list ``[residue_obj, alternative_generic_numbers]``.
    """
    # Default protein segment for any generic number created along the way.
    rns_defaults = {'protein_segment': segment}

    # Default numbering scheme, used for the main generic number.
    ns = settings.DEFAULT_NUMBERING_SCHEME
    ns_obj = ResidueNumberingScheme.objects.get(slug=ns)

    amino_acid = residue['aa']
    sequence_number = residue['pos']
    numbers = residue['numbers']

    # Expand the raw generic number into the full set of derived numbers.
    if 'generic_number' in numbers:
        numbers = format_generic_numbers(
            protein_conformation.protein.residue_numbering_scheme, schemes,
            sequence_number, numbers['generic_number'], numbers['bw'], b_and_c)

    # Main generic number (default numbering scheme).
    generic_number = None
    if 'generic_number' in numbers:
        generic_number = _get_or_create_generic_number(
            schemes, ns, numbers['generic_number'], rns_defaults,
            scheme=ns_obj)

    # Equivalent of the main generic number in the protein's own scheme.
    if 'equivalent' in numbers:
        try:
            ResidueGenericNumberEquivalent.objects.get_or_create(
                default_generic_number=generic_number,
                scheme=protein_conformation.protein.residue_numbering_scheme,
                defaults={'label': numbers['equivalent']})
        except IntegrityError:
            # Result is unused; the lookup only confirms that the row exists
            # and raises otherwise, exactly as before.
            ResidueGenericNumberEquivalent.objects.get(
                default_generic_number=generic_number,
                scheme=protein_conformation.protein.residue_numbering_scheme)

    # Display generic number (protein's own numbering scheme).
    display_generic_number = None
    if 'display_generic_number' in numbers:
        display_scheme = protein_conformation.protein.residue_numbering_scheme
        display_generic_number = _get_or_create_generic_number(
            schemes, display_scheme.slug, numbers['display_generic_number'],
            rns_defaults, scheme=display_scheme)

    # Instantiate the residue without saving it.
    bulk_r = Residue(protein_conformation=protein_conformation,
                     sequence_number=sequence_number,
                     amino_acid=amino_acid,
                     display_generic_number=display_generic_number,
                     generic_number=generic_number,
                     protein_segment=segment)

    # Alternative generic numbers, one per alternative scheme.
    bulk_add_alt = []
    if numbers and 'alternative_generic_numbers' in numbers:
        for alt_scheme, alt_num in numbers[
                'alternative_generic_numbers'].items():
            bulk_add_alt.append(
                _get_or_create_generic_number(schemes, alt_scheme, alt_num,
                                              rns_defaults))

    return [bulk_r, bulk_add_alt]

Пример #2

Показать файл

Файл: build_g_protein_structures.py Проект: zy342245222/protwis

    def handle(self, *args, **options):
        """Build Protein, ProteinConformation and Residue rows for alpha subunits.

        For every SignprotComplex the method:

        1. fetches or creates a ``<pdb code>_a`` Protein and its
           ProteinConformation;
        2. parses the structure's PDB data and pairs each residue of the alpha
           chain that has a CA atom with the residue of ``sc.protein`` carrying
           the same sequence number;
        3. discards amino-acid mismatches that the PDB SEQADV records explain;
        4. if mismatches remain, runs a local pairwise alignment and re-pairs
           the affected positions;
        5. bulk-creates Residue rows for the alpha subunit conformation.

        When ``options['purge']`` is truthy, existing Residue,
        ProteinConformation and Protein rows whose entry name ends with ``_a``
        and whose family grand-parent is named ``Alpha`` are deleted first.

        Any exception raised while processing one complex is printed and
        logged, and processing continues with the next complex.
        """
        self.options = options
        if self.options['purge']:
            Residue.objects.filter(
                protein_conformation__protein__entry_name__endswith='_a',
                protein_conformation__protein__family__parent__parent__name='Alpha'
            ).delete()
            ProteinConformation.objects.filter(
                protein__entry_name__endswith='_a',
                protein__family__parent__parent__name='Alpha').delete()
            Protein.objects.filter(
                entry_name__endswith='_a',
                family__parent__parent__name='Alpha').delete()

        # Raw string keeps the pattern byte-identical while avoiding invalid
        # string escapes; compiled once instead of once per SEQADV line.
        # Groups: 1 residue name, 2 chain, 3 sequence number, 4 database
        # accession field, 5 database residue name, 6 database sequence
        # number, 7 trailing annotation.
        seqadv_pattern = re.compile(
            r'SEQADV\s{1}[A-Z\s\d]{4}\s{1}([A-Z]{3})\s{1}([A-Z]{1})\s+(\d+)[\s\S\d]{5}([\s\S\d]{12})([A-Z]{3})\s+(\d+)(\s\S+)'
        )

        # Building protein and protconf objects for g protein structure in complex
        scs = SignprotComplex.objects.all()
        for sc in scs:
            self.logger.info(
                'Protein, ProteinConformation and Residue build for alpha subunit of {} is building'
                .format(sc))
            try:
                pdb_code = sc.structure.pdb_code.index
                entry_name = pdb_code.lower() + '_a'

                # Alpha subunit protein: reuse an existing row or create one
                # that copies its properties from the complex's protein.
                try:
                    alpha_protein = Protein.objects.get(entry_name=entry_name)
                except Protein.DoesNotExist:
                    alpha_protein = Protein()
                    alpha_protein.entry_name = entry_name
                    alpha_protein.accession = None
                    alpha_protein.name = entry_name
                    alpha_protein.sequence = sc.protein.sequence
                    alpha_protein.family = sc.protein.family
                    alpha_protein.parent = sc.protein
                    alpha_protein.residue_numbering_scheme = sc.protein.residue_numbering_scheme
                    alpha_protein.sequence_type = ProteinSequenceType.objects.get(
                        slug='mod')
                    alpha_protein.source = ProteinSource.objects.get(
                        name='OTHER')
                    alpha_protein.species = sc.protein.species
                    alpha_protein.save()

                try:
                    alpha_protconf = ProteinConformation.objects.get(
                        protein__entry_name=entry_name)
                except ProteinConformation.DoesNotExist:
                    alpha_protconf = ProteinConformation()
                    alpha_protconf.protein = alpha_protein
                    alpha_protconf.state = ProteinState.objects.get(
                        slug='active')
                    alpha_protconf.save()

                pdbp = PDBParser(PERMISSIVE=True, QUIET=True)
                structure = pdbp.get_structure(
                    'struct', StringIO(sc.structure.pdb_data.pdb))
                chain = structure[0][sc.alpha]

                # Sequence numbers of the chain residues that have a CA atom.
                nums = []
                for res in chain:
                    try:
                        res['CA']
                    except KeyError:
                        continue
                    nums.append(res.get_id()[1])

                resis = Residue.objects.filter(
                    protein_conformation__protein=sc.protein)

                # Create first alignment based on sequence numbers:
                # PDB number -> [PDB residue, reference Residue].
                pdb_num_dict = OrderedDict()
                for n in nums:
                    # 6OIJ: numbers below 30 are offset by 6 versus the
                    # reference sequence numbering.
                    if pdb_code == '6OIJ' and n < 30:
                        nr = n + 6
                    else:
                        nr = n
                    pdb_num_dict[n] = [chain[n], resis.get(sequence_number=nr)]

                # Find mismatches between structure and reference amino acids.
                mismatches = [
                    pair for pair in pdb_num_dict.values()
                    if AA[pair[0].get_resname()] != pair[1].amino_acid
                ]

                # Search for annotated engineered mutations in pdb SEQADV
                mutations, shifted_mutations = OrderedDict(), OrderedDict()
                for line in sc.structure.pdb_data.pdb.split('\n'):
                    if not line.startswith('SEQADV'):
                        continue
                    line_search = seqadv_pattern.search(line)
                    if line_search is None or line_search.group(2) != sc.alpha:
                        continue
                    position = int(line_search.group(3))
                    residue_names = [
                        line_search.group(1),
                        line_search.group(5)
                    ]
                    if line_search.group(4).strip() == sc.protein.accession:
                        if line_search.group(3) == line_search.group(6):
                            mutations[position] = residue_names
                        else:
                            # Numbering differs from the database numbering;
                            # keep the database sequence number as well.
                            shifted_mutations[position] = residue_names + [
                                int(line_search.group(6))
                            ]
                    else:
                        # Accession differs from the complex's protein (the
                        # original code listed 6G79 and 5G53 as exceptions
                        # here); the mutation is recorded in every such case.
                        mutations[position] = residue_names

                # Check and clear mismatches that are registered in pdb SEQADV
                # as engineered mutation.
                remaining_mismatches = []
                for m in mismatches:
                    num = m[0].get_id()[1]
                    if num in mutations:
                        if (m[0].get_resname() != mutations[num][0]
                                and m[1].amino_acid != AA[mutations[num][1]]):
                            remaining_mismatches.append(m)
                    else:
                        remaining_mismatches.append(m)

                # Mismatches remained possibly to seqnumber shift, making
                # pairwise alignment to try and fix alignment.
                if remaining_mismatches and pdb_code not in [
                        '6OIJ', '6OY9', '6OYA'
                ]:
                    ppb = PPBuilder()
                    seq = ''.join(
                        str(pp.get_sequence())
                        for pp in ppb.build_peptides(chain, aa_only=False))
                    pw2 = pairwise2.align.localms(sc.protein.sequence, seq, 2,
                                                  -1, -.5, -.1)
                    ref_seq, temp_seq = str(pw2[0][0]), str(pw2[0][1])

                    # PDB residue -> reference Residue. Gap positions use the
                    # alignment column index as a placeholder instead.
                    pdb_wt_dict = OrderedDict()
                    j, k = 0, 0  # j walks reference residues, k walks nums
                    for i, (ref, temp) in enumerate(zip(ref_seq, temp_seq)):
                        if ref != '-' and temp != '-':
                            pdb_wt_dict[pdb_num_dict[nums[k]][0]] = resis[j]
                            j += 1
                            k += 1
                        elif ref == '-':
                            pdb_wt_dict[pdb_num_dict[nums[k]][0]] = i
                            k += 1
                        else:
                            # Gap in the structure sequence only.
                            pdb_wt_dict[i] = resis[j]
                            j += 1

                    for i, r in enumerate(remaining_mismatches):
                        r_num = r[0].get_id()[1]
                        # NOTE(review): for i == 0 the index i - 1 wraps to
                        # the last mismatch; kept as in the original - confirm
                        # that this is intended.
                        prev_num = remaining_mismatches[i - 1][0].get_id()[1]
                        # Adjust for shifted residue when residue is a match
                        if r_num - prev_num > 1:
                            pdb_num_dict[r_num - 1][1] = pdb_wt_dict[chain[
                                r_num - 1]]
                        # Adjust for shifted residue when residue is mutated
                        # and it's logged in SEQADV
                        if r_num in shifted_mutations:
                            pdb_num_dict[r_num][1] = resis.get(
                                sequence_number=shifted_mutations[r_num][2])
                        # Adjust for shift
                        else:
                            pdb_num_dict[r_num][1] = pdb_wt_dict[r[0]]

                # One new Residue per paired position: numbering and amino
                # acid come from the structure, annotations from the
                # reference residue.
                bulked_residues = []
                for val in pdb_num_dict.values():
                    res_obj = Residue()
                    res_obj.sequence_number = val[0].get_id()[1]
                    res_obj.amino_acid = AA[val[0].get_resname()]
                    res_obj.display_generic_number = val[
                        1].display_generic_number
                    res_obj.generic_number = val[1].generic_number
                    res_obj.protein_conformation = alpha_protconf
                    res_obj.protein_segment = val[1].protein_segment
                    bulked_residues.append(res_obj)
                Residue.objects.bulk_create(bulked_residues)
                self.logger.info(
                    'Protein, ProteinConformation and Residue build for alpha subunit of {} is finished'
                    .format(sc))
            except Exception as msg:
                # Best effort per complex: report and move on to the next one.
                print(
                    'Protein, ProteinConformation and Residue build for alpha subunit of {} has failed'
                    .format(sc))
                print(msg)
                self.logger.info(
                    'Protein, ProteinConformation and Residue build for alpha subunit of {} has failed'
                    .format(sc))

Пример #3

Показать файл

Файл: build_constructs.py Проект: GPCRmd/GPCRmd

    def create_constructs(self, filenames):
        self.logger.info('CREATING CONSTRUCTS')
        
        # what files should be parsed?
        if not filenames:
            filenames = os.listdir(self.construct_data_dir)

        # parse files
        for source_file in filenames:
            source_file_path = os.sep.join([self.construct_data_dir, source_file])
            if os.path.isfile(source_file_path) and source_file[0] != '.':
                self.logger.info('Reading file {}'.format(source_file_path))
                # read the yaml file
                with open(source_file_path, 'r') as f:
                    sd = yaml.load(f)

                    # is a protein specified?
                    if 'protein' not in sd:
                        self.logger.error('Protein not specified for construct, skipping')
                        continue

                    # fetch the parent protein
                    try:
                        ppc = ProteinConformation.objects.select_related('protein__family', 'protein__species',
                            'protein__residue_numbering_scheme').get(protein__entry_name=sd['protein'],
                            state__slug=settings.DEFAULT_PROTEIN_STATE)
                    except ProteinConformation.DoesNotExist:
                        # abort if parent protein is not found
                        self.logger.error('Parent protein {} for construct {} not found, aborting!'.format(
                            sd['protein'], sd['name']))
                        continue

                    # create a protein record
                    p = Protein()
                    p.parent = ppc.protein
                    p.family = ppc.protein.family
                    p.species = ppc.protein.species
                    p.residue_numbering_scheme = ppc.protein.residue_numbering_scheme
                    p.sequence_type, created = ProteinSequenceType.objects.get_or_create(slug='mod',
                        defaults={'name': 'Modified'})
                    p.source, created = ProteinSource.objects.get_or_create(name='OTHER')
                    p.entry_name = slugify(strip_tags(sd['name']))
                    p.name = sd['name']
                    p.sequence = ppc.protein.sequence
                    # save protein (construct)
                    try:
                        p.save()
                        self.logger.info('Created construct {} with parent protein {}'.format(p.name,
                            ppc.protein.entry_name))
                    except Exception as e:
                        print(e)
                        self.logger.error('Failed creating construct {} with parent protein {}'.format(p.name,
                            ppc.protein.entry_name))
                        continue

                    # create protein conformation record
                    pc = ProteinConformation()
                    pc.protein = p
                    pc.state = ProteinState.objects.get(slug=settings.DEFAULT_PROTEIN_STATE)
                    try:
                        pc.save()
                        self.logger.info('Created conformation {} of protein {}'.format(pc.state.name, p.name))
                    except:
                        self.logger.error('Failed creating conformation {} of protein {}'.format(pc.state.name,
                            p.entry_name))

                    # create residue records
                    deletions = []
                    deletions_list = []
                    if 'deletions' in sd and sd['deletions']:
                        for t in sd['deletions']:
                            deletions += list(range(t[0],t[1]+1))
                            deletions_list.append(str(t[0])+'-'+str(t[1])) 
                    s = ","
                    deletion_string = s.join(deletions_list)
                         

                    mutations = {}
                    if 'mutations' in sd and sd['mutations']:
                        for m in sd['mutations']:
                            res_num = m[1:-1]
                            mutations[res_num] = {
                                'wt_res': m[0],
                                'mut_res': m[-1],
                                'full': m,
                            }
                    
                    # Create construct record
                    c = Construct()            
                    c.protein_conformation = pc
                    c.deletions =  deletion_string
                    c.save()
                      

                    # Create Auxiliary proteins
#                    if 'auxiliary_proteins' in sd and sd['auxiliary_proteins']:
#                        ap = AuxProtein()
#                        ap.construct = c
#                        apct = AuxProteinType.objects.create()
                       # ap.protein_type = apct 
#                        apct.save()
#                        if 'remarks' in sd['auxiliary_proteins']:
#                            ap.remarks = sd['auxiliary_proteins']['remarks']
#                        ap.save()
 

#                        for step in sd['auxiliary_proteins']:
#                            if 'type' in step and 'name' in step and'sequence' in step:
#                                ap.protein_type = apct
                 #              ap.protein_type, created = AuxProteinType.objects.get_or_create()
#                                ap.name = sd['auxiliary_proteins']['name']
#                                ap.uniprot_id = sd['auxiliary_proteins']['uniprot_id']
#                                ap.sequence = sd['auxiliary_proteins']['sequence']
                                #mutations if any to be included from mutation model along with reason of mutation
#                                ap.position = sd['auxiliary_proteins']['position']
#                                ap.deletions = sd['auxiliary_proteins']['deletions']
                                
#                            else:
#                                self.logger.error('Auxiliary protein step incorrectly defined for {}'.format(p))



                     # create expression records
                    if 'expression_sys' in sd and sd['expression_sys']:
                        ce = ConstructExpression()           
                        ce.construct = c
                        ce.expression_system, created = ConstructExpressionSystem.objects.get_or_create(expression_method = sd['expression_sys']['expression_method'], host_cell_type = sd['expression_sys']['host_cell_type'], host_cell = sd['expression_sys']['host_cell'])
                        if 'remarks' in sd:
                            ce.remarks = sd['expression_sys']['remarks']
                        ce.save()
 
               
                    # create solubilization records
                    if 'solubilization' in sd and sd['solubilization'] and 'steps' in sd['solubilization'] and sd['solubilization']['steps']:
                        so = ConstructSolubilization()
                        so.construct = c
                        cl = ChemicalList.objects.create()
                        so.chemical_list = cl 

                        for step in sd['solubilization']['steps']:
                            if 'type' in step and 'item' in step and'concentration' in step:
                                chem = Chemical()
                                chem.chemical_type,  created = ChemicalType.objects.get_or_create(name = step['type']) 
                                chem.name =  step['item']
                                chem.save()

                                cc = ChemicalConc()
                                cc.concentration = step['concentration']
                                cc.chemical = chem    # since ChemicalConc has a ForeignKey to Chemical
                                cc.save()
                                cl.chemicals.add(cc)                          
                            else:
                                self.logger.error('Solubilization step incorrectly defined for {}'.format(p))                                 

                        if 'remarks' in sd['solubilization']:
                            so.remarks = sd['solubilization']['remarks']
                        so.save()



                    # create  purification records
                    if 'purification' in sd and sd['purification'] and sd['purification']['steps']:
                        pu = ConstructPurification()
                        pu.construct = c
                        if 'remarks' in sd['purification']:
                            pu.remarks = sd['purification']['remarks']
                        pu.save() 
                        for step in sd['purification']['steps']:
                            if 'type' in step and 'description' in step:
                                pust = PurificationStep()
                                pust.description = step['description']
                                pust.purification = pu
                                pust.purification_type, created = PurificationStepType.objects.get_or_create(name = step['type'] ) # 2 values returned by get_or_create
                                if created: 
                                    self.logger.info('Created purification step type {}'.format(pust.purification_type))
                                pust.save()

                            else:
                                self.logger.error('Purification step incorrectly defined for {}'.format(p))

                        


                   # create crystallization records
                    if 'crystallization' in sd and sd['crystallization']: 
                        cy = ConstructCrystallization()
                        cy.construct = c
                        cyt = CrystallizationMethodTypes.objects.create()
                        cy.crystal_type = cyt
                        cy.method = sd['crystallization']['method']
                        cy.settings = sd['crystallization']['settings']
                        cy.protein_conc = sd['crystallization']['protein_conc']
                        cl = ChemicalList.objects.create()
                        cy.chemical_list = cl   

                        for step in sd['crystallization']['chemicallist']:
                            if 'type' in step and 'item' in step and'concentration' in step:
                                chem = Chemical()
                                chem.chemical_type,  created = ChemicalType.objects.get_or_create(name = step['type']) 
                                
                                chem.name =  step['item']
                                chem.save()
                                cc = ChemicalConc()
                                cc.concentration = step['concentration']
                                cc.chemical = chem    # since ChemicalConc has a ForeignKey to Chemical
                                cc.save()
                                
                                cl.chemicals.add(cc)                          
                            else:
                                self.logger.error('Crystallization step incorrectly defined for {}'.format(p))                        

                        cy.aqueous_solution_lipid_ratio = sd['crystallization']['aqueous_solution_lipid_ratio_LCP']
                        cy.lcp_bolus_volume = sd['crystallization']['LCP_bolus_volume']
                        cy.precipitant_solution_volume = sd['crystallization']['precipitant_solution_volume']
                        cy.temp = sd['crystallization']['temperature']
                        cy.ph = sd['crystallization']['ph']  


                        if 'remarks' in sd['crystallization']:
                            cy.remarks = sd['crystallization']['remarks']
                        cy.save()

                                     
                    # fusion proteins
                    split_segments = {}
                    if 'fusion_proteins' in sd and sd['fusion_proteins']:
                        for fp in sd['fusion_proteins']:
                            fp_start = Residue.objects.get(protein_conformation=ppc,
                                sequence_number=fp['positions'][0])
                            fp_end = Residue.objects.get(protein_conformation=ppc, sequence_number=fp['positions'][1])
                            # if the fusion protein is inserted within only one segment (the usual case), split that
                            # segment into two segments
                            if fp_start and fp_start.protein_segment == fp_end.protein_segment:
                                # get/create split protein segments
                                segment_before, created = ProteinSegment.objects.get_or_create(
                                    slug=fp_start.protein_segment.slug+"_1", defaults={
                                    'name': fp_start.protein_segment.name,
                                    'category': fp_start.protein_segment.category,
                                    'partial': True})
                                segment_after, created = ProteinSegment.objects.get_or_create(
                                    slug=fp_start.protein_segment.slug+"_2", defaults={
                                    'name': fp_start.protein_segment.name,
                                    'category': fp_start.protein_segment.category,
                                    'partial': True})

                                # keep track of  information about split segments
                                split_segments[fp_start.protein_segment.slug] = {
                                    'start': {
                                        'sequence_number': fp['positions'][0],
                                        'segment': segment_before,
                                    },
                                    'end': {
                                        'sequence_number': fp['positions'][1],
                                        'segment': segment_after,
                                    },
                                }

                            # get/insert fusion protein
                            fusion, create = ProteinFusion.objects.get_or_create(name=fp['name'], defaults={
                                'sequence': fp['sequence']})

                            # create relationship with protein
                            ProteinFusionProtein.objects.create(protein=p, protein_fusion=fusion,
                                segment_before=segment_before, segment_after=segment_after)

                    prs = Residue.objects.filter(protein_conformation=ppc).prefetch_related(
                        'protein_conformation__protein', 'protein_segment', 'generic_number',
                        'display_generic_number__scheme', 'alternative_generic_numbers__scheme')
                    updated_sequence = ''
                    for pr in prs:
                        if pr.sequence_number not in deletions:
                            r = Residue()
                            r.protein_conformation = pc
                            r.generic_number = pr.generic_number
                            r.display_generic_number = pr.display_generic_number
                            r.sequence_number = pr.sequence_number
                            
                            # check for split segments
                            if pr.protein_segment.slug in split_segments:
                                rsns = split_segments[pr.protein_segment.slug]['start']['sequence_number']
                                rsne = split_segments[pr.protein_segment.slug]['end']['sequence_number']
                                if r.sequence_number <= rsns:
                                    r.protein_segment = split_segments[pr.protein_segment.slug]['start']['segment']
                                elif r.sequence_number >= rsne:
                                    r.protein_segment = split_segments[pr.protein_segment.slug]['end']['segment']
                            else:
                                r.protein_segment = pr.protein_segment

                            # amino acid, check for mutations
                            if r.sequence_number in mutations:
                                if mutations[r.sequence_number]['wt_res'] == pr.amino_acid:
                                    r.amino_acid = mutations[r.sequence_number]['mut_res']
                                else:
                                    self.logger.error('Mutation {} in construct {} does not match wild-type sequence' \
                                        + ' of {}'.format(mutations[r.sequence_number]['full'], pc.protein.name,
                                        ppc.protein.entry_name))
                            else:
                                r.amino_acid = pr.amino_acid

                            # save amino acid to updated sequence
                            updated_sequence += r.amino_acid

                            # save residue before populating M2M relations
                            r.save()

                            # alternative generic numbers
                            agns = pr.alternative_generic_numbers.all()
                            for agn in agns:
                                r.alternative_generic_numbers.add(agn)
                    
                    # update sequence
                    p.sequence = updated_sequence
                    p.save()

        self.logger.info('COMPLETED CREATING CONSTRUCTS')

Пример #4

Показать файл

Файл: build_g_protein_structures.py Проект: marghederiu/protwis

    def handle(self, *args, **options):
        """Build alpha-subunit records for complexes, then SignprotStructure objects.

        Steps:

        1. If ``options['purge']`` is truthy, delete the Residue,
           ProteinConformation and Protein rows whose protein entry name ends
           with ``_a`` (family grand-parent named ``Alpha``), plus every
           SignprotStructureExtraProteins and SignprotStructure row.
        2. Unless ``options['only_signprot_structures']`` is truthy, for every
           SignprotComplex get or create a ``<pdb code>_a`` Protein and
           ProteinConformation, then bulk-create one Residue per chain residue
           that has a CA atom, copying the annotation of the matching residue
           of ``sc.protein``. A failure for one complex is logged and the loop
           moves on to the next complex.
        3. For every protein whose family slug starts with ``100_001`` and that
           has an accession, build a SignprotStructure for each of its PDB ids
           that is not already the structure of a SignprotComplex.
        """
        self.options = options
        if self.options['purge']:
            Residue.objects.filter(
                protein_conformation__protein__entry_name__endswith='_a',
                protein_conformation__protein__family__parent__parent__name='Alpha',
            ).delete()
            ProteinConformation.objects.filter(
                protein__entry_name__endswith='_a',
                protein__family__parent__parent__name='Alpha',
            ).delete()
            Protein.objects.filter(
                entry_name__endswith='_a',
                family__parent__parent__name='Alpha',
            ).delete()
            SignprotStructureExtraProteins.objects.all().delete()
            SignprotStructure.objects.all().delete()

        if not options['only_signprot_structures']:
            # Raw string: the pattern is full of \s and \d, which are invalid
            # escape sequences in a normal string literal. Compiled once here
            # instead of once per SEQADV line.
            seqadv_pattern = re.compile(
                r'SEQADV\s{1}[A-Z\s\d]{4}\s{1}([A-Z]{3})\s{1}([A-Z]{1})\s+(\d+)[\s\S\d]{5}([\s\S\d]{12})([A-Z]{3})\s+(\d+)(\s\S+)'
            )

            # Building protein and protconf objects for g protein structure in complex
            scs = SignprotComplex.objects.all()
            for sc in scs:
                self.logger.info(
                    'Protein, ProteinConformation and Residue build for alpha subunit of {} is building'
                    .format(sc))
                try:
                    alpha_entry_name = sc.structure.pdb_code.index.lower() + '_a'

                    # Alpha subunit protein: reuse the existing record or create it
                    try:
                        alpha_protein = Protein.objects.get(
                            entry_name=alpha_entry_name)
                    except Protein.DoesNotExist:
                        alpha_protein = Protein()
                        alpha_protein.entry_name = alpha_entry_name
                        alpha_protein.accession = None
                        alpha_protein.name = alpha_entry_name
                        alpha_protein.sequence = sc.protein.sequence
                        alpha_protein.family = sc.protein.family
                        alpha_protein.parent = sc.protein
                        alpha_protein.residue_numbering_scheme = sc.protein.residue_numbering_scheme
                        alpha_protein.sequence_type = ProteinSequenceType.objects.get(
                            slug='mod')
                        alpha_protein.source = ProteinSource.objects.get(
                            name='OTHER')
                        alpha_protein.species = sc.protein.species
                        alpha_protein.save()

                    # Matching conformation: reuse or create in the 'active' state
                    try:
                        alpha_protconf = ProteinConformation.objects.get(
                            protein__entry_name=alpha_entry_name)
                    except ProteinConformation.DoesNotExist:
                        alpha_protconf = ProteinConformation()
                        alpha_protconf.protein = alpha_protein
                        alpha_protconf.state = ProteinState.objects.get(
                            slug='active')
                        alpha_protconf.save()

                    pdbp = PDBParser(PERMISSIVE=True, QUIET=True)
                    structure = pdbp.get_structure(
                        'struct', StringIO(sc.structure.pdb_data.pdb))
                    chain = structure[0][sc.alpha]

                    # Sequence numbers of the chain residues that have a CA atom
                    nums = []
                    for res in chain:
                        try:
                            res['CA']
                        except KeyError:
                            continue
                        nums.append(res.get_id()[1])

                    resis = Residue.objects.filter(
                        protein_conformation__protein=sc.protein)
                    pdb_num_dict = OrderedDict()
                    # Create first alignment based on sequence numbers
                    for n in nums:
                        # 6OIJ: residues numbered below 30 are paired with the
                        # wild-type residue six positions further on.
                        if sc.structure.pdb_code.index == '6OIJ' and n < 30:
                            nr = n + 6
                        else:
                            nr = n
                        pdb_num_dict[n] = [
                            chain[n], resis.get(sequence_number=nr)
                        ]

                    # Find mismatches between structure and wild-type residues
                    mismatches = [
                        pair for pair in pdb_num_dict.values()
                        if AA[pair[0].get_resname()] != pair[1].amino_acid
                    ]

                    seqadv = [
                        pdb_line
                        for pdb_line in sc.structure.pdb_data.pdb.split('\n')
                        if pdb_line.startswith('SEQADV')
                    ]
                    mutations, shifted_mutations = OrderedDict(), OrderedDict()
                    # Search for annotated engineered mutations in pdb SEQADV
                    for seqadv_line in seqadv:
                        line_search = seqadv_pattern.search(seqadv_line)
                        if line_search is None:
                            continue
                        if line_search.group(2) != sc.alpha:
                            continue
                        pdb_seq_num = int(line_search.group(3))
                        residue_pair = [
                            line_search.group(1),
                            line_search.group(5)
                        ]
                        if line_search.group(4).strip() == sc.protein.accession:
                            if line_search.group(3) == line_search.group(6):
                                mutations[pdb_seq_num] = residue_pair
                            else:
                                shifted_mutations[pdb_seq_num] = residue_pair + [
                                    int(line_search.group(6))
                                ]
                        else:
                            # The original code had two exception cases here
                            # (6G79: shifted number plus 'CONFLICT'; 5G53:
                            # different accession). The second condition always
                            # holds in this branch, so the entry is recorded
                            # as a plain mutation in every case.
                            mutations[pdb_seq_num] = residue_pair

                    # Check and clear mismatches that are registered in pdb SEQADV as engineered mutation
                    remaining_mismatches = []
                    for m in mismatches:
                        num = m[0].get_id()[1]
                        if num not in mutations:
                            remaining_mismatches.append(m)
                        elif (m[0].get_resname() != mutations[num][0]
                              and m[1].amino_acid != AA[mutations[num][1]]):
                            remaining_mismatches.append(m)

                    # Mismatches remained possibly to seqnumber shift, making pairwise alignment to try and fix alignment
                    if remaining_mismatches and sc.structure.pdb_code.index not in [
                            '6OIJ', '6OY9', '6OYA', '6LPB', '6WHA'
                    ]:
                        ppb = PPBuilder()
                        seq = ''
                        for pp in ppb.build_peptides(chain, aa_only=False):
                            seq += str(pp.get_sequence())
                        pw2 = pairwise2.align.localms(sc.protein.sequence, seq,
                                                      2, -1, -.5, -.1)
                        ref_seq, temp_seq = str(pw2[0][0]), str(pw2[0][1])

                        # Map each structure residue (or the alignment column
                        # index where one side is a gap) to its aligned partner.
                        pdb_wt_dict = OrderedDict()
                        j, k = 0, 0
                        for i, (ref, temp) in enumerate(zip(ref_seq, temp_seq)):
                            if ref != '-' and temp != '-':
                                pdb_wt_dict[pdb_num_dict[nums[k]][0]] = resis[j]
                                j += 1
                                k += 1
                            elif ref == '-':
                                pdb_wt_dict[pdb_num_dict[nums[k]][0]] = i
                                k += 1
                            elif temp == '-':
                                pdb_wt_dict[i] = resis[j]
                                j += 1

                        for i, r in enumerate(remaining_mismatches):
                            pdb_num = r[0].get_id()[1]
                            # Adjust for shifted residue when residue is a match
                            # NOTE(review): for i == 0 this compares against the
                            # last mismatch (index -1) - confirm that is intended.
                            if pdb_num - remaining_mismatches[
                                    i - 1][0].get_id()[1] > 1:
                                pdb_num_dict[pdb_num - 1][1] = pdb_wt_dict[
                                    chain[pdb_num - 1]]
                            # Adjust for shifted residue when residue is mutated and it's logged in SEQADV
                            if pdb_num in shifted_mutations:
                                pdb_num_dict[pdb_num][1] = resis.get(
                                    sequence_number=shifted_mutations[pdb_num][2])
                            # Adjust for shift
                            else:
                                pdb_num_dict[pdb_num][1] = pdb_wt_dict[r[0]]
                    # NOTE: 6WHA (mini-Gq/Gi2/Gs chimera) is excluded from the
                    # pairwise re-alignment above; no custom alignment is applied.

                    bulked_residues = []
                    for val in pdb_num_dict.values():
                        # An int in the second slot is an alignment column
                        # index, i.e. no wild-type residue to copy from.
                        if not isinstance(val[1], int):
                            res_obj = Residue()
                            res_obj.sequence_number = val[0].get_id()[1]
                            res_obj.amino_acid = AA[val[0].get_resname()]
                            res_obj.display_generic_number = val[
                                1].display_generic_number
                            res_obj.generic_number = val[1].generic_number
                            res_obj.protein_conformation = alpha_protconf
                            res_obj.protein_segment = val[1].protein_segment
                            bulked_residues.append(res_obj)
                        else:
                            self.logger.info(
                                'Skipped {} as no annotation was present, while building for alpha subunit of {}'
                                .format(val[1], sc))

                    Residue.objects.bulk_create(bulked_residues)
                    self.logger.info(
                        'Protein, ProteinConformation and Residue build for alpha subunit of {} is finished'
                        .format(sc))
                except Exception as msg:
                    # Keep going with the next complex, but record why this one
                    # failed instead of dropping the exception.
                    self.logger.error(
                        'Protein, ProteinConformation and Residue build for alpha subunit of {} has failed\n{}: {}'
                        .format(sc, type(msg), msg))

        ### Build SignprotStructure objects from non-complex signprots
        g_prot_alphas = Protein.objects.filter(
            family__slug__startswith='100_001',
            accession__isnull=False)
        # Materialised once as a set so each membership test is O(1)
        complex_structures = set(SignprotComplex.objects.all().values_list(
            'structure__pdb_code__index', flat=True))
        for a in g_prot_alphas:
            pdb_list = get_pdb_ids(a.accession)
            for pdb in pdb_list:
                if pdb not in complex_structures:
                    try:
                        data = self.fetch_gprot_data(pdb, a)
                        if data:
                            self.build_g_prot_struct(a, pdb, data)
                    except Exception as msg:
                        self.logger.error(
                            'SignprotStructure of {} {} failed\n{}: {}'.format(
                                a.entry_name, pdb, type(msg), msg))

Пример #5

Показать файл

    def create_residues(self, args):
        """Create Residue records and their generic numbers from dump files.

        Args:
            args: iterable of file names, resolved against
                ``self.dump_source_dir``. Each file holds one residue per
                line as ten comma-separated (optionally double-quoted) fields:
                id, residue number, three-letter residue name, oliveira
                number, gpcrdb number, bw number, two unused fields, protein
                entry name and protein segment slug.

        Files that do not exist are reported and skipped. Lines for proteins
        without a conformation in the default state, and residues already in
        the database, are skipped.
        """
        schemes = {
            'gpcrdb': {'type': False},
            'gpcrdba': {
                'type': 'structure',
                'seq_based': 'bw',
            },
            'gpcrdbb': {
                'type': 'structure',
                'seq_based': 'woot',
            },
            'gpcrdbc': {
                'type': 'structure',
                'seq_based': 'pin',
            },
            'gpcrdbf': {
                'type': 'structure',
                'seq_based': 'wang',
            },
            'bw': {'type': 'sequence'},
            'woot': {'type': 'sequence'},
            'pin': {'type': 'sequence'},
            'wang': {'type': 'sequence'},
        }

        # Attach the database object and, when a mapping file exists, the
        # conversion table (first column -> second column) to each scheme.
        for scheme_name, scheme in schemes.items():
            scheme['obj'] = ResidueNumberingScheme.objects.get(slug=scheme_name)
            mapping_file = os.sep.join([self.generic_numbers_source_dir, 'mapping_' + scheme_name + '.txt'])
            if os.path.isfile(mapping_file):
                with open(mapping_file, "r", encoding='UTF-8') as scheme_table_file:
                    scheme['table'] = {}
                    for row in scheme_table_file:
                        split_row = shlex.split(row)
                        scheme['table'][split_row[0]] = split_row[1]

        # entry names already known to be absent, so they are looked up once
        missing_proteins = set()
        self.logger.info('CREATING RESIDUES')
        for arg in args:
            dump_path = os.sep.join([self.dump_source_dir, arg])
            if not os.path.exists(dump_path):
                print("Failed to open file {!s}".format(dump_path))
                self.logger.error("Failed to open file {!s}".format(dump_path))
                continue
            self.logger.info('Parsing residue data from {}'.format(arg))
            # context manager so the dump file is closed once it is parsed
            with open(dump_path, 'r') as residue_data_fh:
                for line in residue_data_fh:
                    self._create_residue_from_dump_line(line, schemes, missing_proteins)
        self.logger.info('COMPLETED CREATING RESIDUES')

    def _fetch_or_create_generic_number(self, label, scheme_obj, residue):
        """Return ``(generic_number, created)`` for a label in a numbering scheme.

        A newly created number takes its protein segment from ``residue``.
        """
        try:
            return ResidueGenericNumber.objects.get(label=label, scheme=scheme_obj), False
        except ResidueGenericNumber.DoesNotExist:
            generic_number = ResidueGenericNumber()
            generic_number.label = label
            generic_number.scheme = scheme_obj
            generic_number.protein_segment = residue.protein_segment
            generic_number.save()
            return generic_number, True

    def _create_residue_from_dump_line(self, line, schemes, missing_proteins):
        """Create one Residue, with generic numbers, from one dump-file line.

        ``missing_proteins`` is a set of entry names with no conformation in
        the default state; it is read and extended here.
        """
        # double strip: whitespace first, then the surrounding quotes
        fields = [x.strip().strip('"') for x in line.split(',')]
        if len(fields) != 10:
            # blank lines are ignored silently, anything else is reported
            if line.strip():
                self.logger.error('Skipping malformed residue line: {!r}'.format(line))
            return
        _dump_id, res_num, res_name, oli, gpcrdb, bw, _bw2, _bs, prot_name, sec_str_name = fields

        if prot_name in missing_proteins:
            return

        # Checking if the protein exists in the db
        try:
            pconf = ProteinConformation.objects.get(protein__entry_name=prot_name,
                state__slug=settings.DEFAULT_PROTEIN_STATE)
        except ProteinConformation.DoesNotExist:
            missing_proteins.add(prot_name)
            return

        # Checking if given residue already exists in the db
        try:
            Residue.objects.get(protein_conformation=pconf.id, sequence_number=res_num)
        except Residue.DoesNotExist:
            pass
        else:
            return

        r = Residue()
        r.protein_conformation = pconf
        r.sequence_number = int(res_num)
        r.amino_acid = polypeptide.three_to_one(res_name.upper())

        # the residue must be saved before M2M relations can be added to it
        try:
            r.save()
            self.logger.info('Created residue {:n}{!s} for protein {!s}'.format(r.sequence_number,
                r.amino_acid, pconf.protein.entry_name))
        except Exception as msg:
            print(msg)
            self.logger.error('Failed to create residue {:n}{!s} for protein {!s}'.format(
                r.sequence_number, r.amino_acid, pconf.protein.entry_name))
            return

        # residue segment
        dump_segment = sec_str_name
        try:
            r.protein_segment = ProteinSegment.objects.get(slug=dump_segment)
        except ProteinSegment.DoesNotExist:
            self.logger.error('Failed to fetch protein segment {}'.format(dump_segment))

        # generic number
        if str(oli) != '0' and gpcrdb != 'None' and bw != 'None':
            # separate bulge number (1241 - > 124 + 1)
            bulge_prime = ''
            dump_oliveira = str(oli)
            if len(dump_oliveira) == 4:
                bulge_prime = dump_oliveira[3]
                dump_oliveira = dump_oliveira[:3]
            dump_gpcrdb = gpcrdb[:4]
            dump_seq_based = bw

            # default gpcrdb number
            default_scheme = schemes[settings.DEFAULT_NUMBERING_SCHEME]
            def_gpcrdb = False
            if dump_oliveira in default_scheme['table']:
                default_label = default_scheme['table'][dump_oliveira] + bulge_prime
                def_gpcrdb, created = self._fetch_or_create_generic_number(
                    default_label, default_scheme['obj'], r)
                if created:
                    self.logger.info('Created generic number {:s} in numbering scheme {:s}'
                        .format(default_label, default_scheme['obj'].short_name))

            # if default number was found/added successfully, process the alternative numbers
            if def_gpcrdb:
                # add default generic number to residue record
                r.generic_number = def_gpcrdb

                protein_scheme_slug = pconf.protein.residue_numbering_scheme.slug

                # dict of sequence-based numbers, for use in structure-based numbers (5.46x461)
                seq_based_labels = {}

                # sequence-based schemes first (the sequence-based numbers are needed for the
                # structure based schemes)
                for scheme_name, scheme in schemes.items():
                    if scheme['type'] != 'sequence':
                        continue
                    # is this number in the scheme defined for this protein?
                    if scheme_name == schemes[protein_scheme_slug]['seq_based']:
                        seq_based_label = dump_seq_based
                    # if not convert the number to the correct scheme
                    else:
                        # reset per scheme so a failed conversion cannot reuse
                        # the label left over from a previous scheme or residue
                        seq_based_label = None
                        for d, c in schemes[schemes[protein_scheme_slug]['seq_based']]['table'].items():
                            if c == dump_seq_based:
                                seq_based_label = scheme['table'][d]
                                break
                    if seq_based_label is None:
                        self.logger.error(
                            'Could not convert number {} to scheme {} for residue {}{!s} of protein {!s}'.format(
                                dump_seq_based, scheme_name, res_num, res_name, pconf.protein.entry_name))
                        continue

                    # fetch/insert the number
                    seq_based, _ = self._fetch_or_create_generic_number(
                        seq_based_label, scheme['obj'], r)
                    r.alternative_generic_numbers.add(seq_based)

                    # add added number to the dict for later use
                    seq_based_labels[scheme_name] = seq_based_label

                # structure-based numbers
                for scheme_name, scheme in schemes.items():
                    if scheme['type'] != 'structure':
                        continue
                    # is this number in the scheme defined for this protein?
                    if scheme_name == protein_scheme_slug:
                        struct_based_label = dump_gpcrdb + bulge_prime
                    # if not convert the number to the correct scheme
                    else:
                        struct_based_label = None
                        for d, c in schemes[protein_scheme_slug]['table'].items():
                            if c == dump_gpcrdb:
                                struct_based_label = scheme['table'][d] + bulge_prime
                                break
                    # skip when either the conversion or the sequence-based
                    # number this scheme builds on is unavailable
                    if struct_based_label is None or scheme['seq_based'] not in seq_based_labels:
                        self.logger.error(
                            'Could not convert number {} to scheme {} for residue {}{!s} of protein {!s}'.format(
                                dump_gpcrdb, scheme_name, res_num, res_name, pconf.protein.entry_name))
                        continue

                    # add the sequence-based label (5x461 -> 5.46x461)
                    split_struct_based_label = struct_based_label.split('x')
                    struct_based_label = (seq_based_labels[scheme['seq_based']] + 'x' +
                        split_struct_based_label[1])

                    # fetch/insert the number
                    struct_based, _ = self._fetch_or_create_generic_number(
                        struct_based_label, scheme['obj'], r)

                    # add to residue as a display number or alternative number?
                    if scheme_name == protein_scheme_slug:
                        r.display_generic_number = struct_based
                    else:
                        r.alternative_generic_numbers.add(struct_based)

        # persist segment and generic number assignments
        try:
            r.save()
            self.logger.info('Added generic numbers for residue {}{!s} for protein {!s}'.format(res_num,
                res_name, pconf.protein.entry_name))
        except Exception as msg:
            print(msg)
            self.logger.error(
                'Failed to create generic numbers for residue {}{!s} for protein {!s}'.format(res_num,
                    res_name, pconf.protein.entry_name))

Пример #6

Показать файл

    def main_func(self, positions, iteration):
        """Build construct records from a slice of the YAML construct files.

        For every regular, non-hidden file in the selected slice of
        ``self.filenames`` (resolved against ``self.construct_data_dir``) the
        file is parsed as YAML and used to create a construct ``Protein`` and
        its ``ProteinConformation``, followed by deletion, mutation, fusion
        (insertion), expression, solubilization, purification and
        crystallization records, and finally one ``Residue`` per parent
        residue that is not deleted. Files that cannot be processed are
        logged and skipped.

        Args:
            positions: indexable pair ``(start, end)`` used to slice
                ``self.filenames``; a falsy ``end`` means "until the end".
            iteration: not used by this method.

        Returns:
            None.
        """
        # select the slice of files assigned to this call
        start, end = positions[0], positions[1]
        filenames = self.filenames[start:end] if end else self.filenames[start:]

        for source_file in filenames:
            source_file_path = os.sep.join([self.construct_data_dir, source_file])
            # skip anything that is not a regular file, and hidden files
            if not os.path.isfile(source_file_path) or source_file[0] == '.':
                continue

            self.logger.info('Reading file {}'.format(source_file_path))
            # read the yaml file; the handle is only needed while parsing
            with open(source_file_path, 'r') as f:
                # NOTE(review): yaml.load() without an explicit Loader can build arbitrary Python
                # objects and is rejected by recent PyYAML releases. If these files are plain
                # data, switch to yaml.safe_load(f) -- confirm before changing.
                sd = yaml.load(f)

            # is a protein specified? (an empty YAML document parses to None)
            if not sd or 'protein' not in sd:
                self.logger.error('Protein not specified for construct, skipping')
                continue

            # fetch the parent protein
            try:
                ppc = ProteinConformation.objects.prefetch_related('protein__family', 'protein__species',
                    'protein__residue_numbering_scheme').get(protein__entry_name=sd['protein'],
                    state__slug=settings.DEFAULT_PROTEIN_STATE)
            except ProteinConformation.DoesNotExist:
                # abort this construct if the parent protein is not found
                self.logger.error('Parent protein {} for construct {} not found, aborting!'.format(
                    sd['protein'], sd['name']))
                continue

            # sequence type (IntegrityError: another writer created it concurrently, so fetch it)
            try:
                sequence_type, created = ProteinSequenceType.objects.get_or_create(slug='mod',
                    defaults={'name': 'Modified'})
                if created:
                    self.logger.info('Created sequence type {}'.format(sequence_type))
            except IntegrityError:
                sequence_type = ProteinSequenceType.objects.get(slug='mod')

            # protein source
            try:
                protein_source, created = ProteinSource.objects.get_or_create(name='OTHER')
                if created:
                    self.logger.info('Created protein source {}'.format(protein_source))
            except IntegrityError:
                protein_source = ProteinSource.objects.get(name='OTHER')

            # create a protein record; the sequence starts as the parent's and is replaced
            # once the residues have been processed
            p = Protein()
            p.parent = ppc.protein
            p.family = ppc.protein.family
            p.species = ppc.protein.species
            p.residue_numbering_scheme = ppc.protein.residue_numbering_scheme
            p.sequence_type = sequence_type
            p.source = protein_source
            p.entry_name = slugify(strip_tags(sd['name']))
            p.name = sd['name']
            p.sequence = ppc.protein.sequence

            # save protein (construct)
            try:
                p.save()
                self.logger.info('Created construct {} with parent protein {}'.format(p.name,
                    ppc.protein.entry_name))
            except Exception:
                self.logger.error('Failed creating construct {} with parent protein {}'.format(p.name,
                    ppc.protein.entry_name))
                continue

            # create protein conformation record
            pc = ProteinConformation()
            pc.protein = p
            pc.state = ProteinState.objects.get(slug=settings.DEFAULT_PROTEIN_STATE)
            try:
                pc.save()
                self.logger.info('Created conformation {} of protein {}'.format(pc.state.name, p.name))
            except Exception:
                self.logger.error('Failed creating conformation {} of protein {}'.format(pc.state.name,
                    p.entry_name))
                # every record below references pc, so nothing more can be stored for this file
                continue

            # process deletions (save in db, and for sequence processing)
            deletions = set()
            for t in sd.get('deletions') or []:
                deletions.update(range(t[0], t[1] + 1))
                # create() always inserts a new row, so there is no "created" flag to test
                ConstructDeletion.objects.create(construct=pc, start=t[0], end=t[1])
                self.logger.info('Created deletion {}-{} for {}'.format(t[0], t[1],
                    pc.protein.entry_name))

            # process mutations (save in db, and for sequence processing)
            # each entry is a string such as <wild type aa><sequence number><mutant aa>
            mutations = {}
            for m in sd.get('mutations') or []:
                res_num = int(m[1:-1])
                mutations[res_num] = {
                    'wt_res': m[0],
                    'mut_res': m[-1],
                    'full': m,
                }
                ConstructMutation.objects.get_or_create(
                    construct=pc,
                    sequence_number=res_num,
                    wild_type_amino_acid=m[0],
                    mutated_amino_acid=m[-1],
                )

            # insertions
            split_segments = {}
            for ins in sd.get('insertions') or []:
                ins_first, ins_last = ins['positions'][0], ins['positions'][1]
                ins_start = Residue.objects.get(protein_conformation=ppc, sequence_number=ins_first)
                ins_end = Residue.objects.get(protein_conformation=ppc, sequence_number=ins_last)
                # if the insertion is within only one segment (the usual case), split that
                # segment into two segments
                if ins_start and ins_start.protein_segment == ins_end.protein_segment:
                    # get/create split protein segments
                    slug_1 = ins_start.protein_segment.slug + "_1"
                    try:
                        segment_before, created = ProteinSegment.objects.get_or_create(slug=slug_1,
                            defaults={'name': ins_start.protein_segment.name,
                            'category': ins_start.protein_segment.category, 'partial': True})
                        if created:
                            self.logger.info('Created protein segment {}'.format(segment_before))
                    except IntegrityError:
                        segment_before = ProteinSegment.objects.get(slug=slug_1)

                    slug_2 = ins_start.protein_segment.slug + "_2"
                    try:
                        segment_after, created = ProteinSegment.objects.get_or_create(slug=slug_2,
                            defaults={'name': ins_start.protein_segment.name,
                            'category': ins_start.protein_segment.category, 'partial': True})
                        if created:
                            self.logger.info('Created protein segment {}'.format(segment_after))
                    except IntegrityError:
                        segment_after = ProteinSegment.objects.get(slug=slug_2)

                    # keep track of information about split segments
                    split_segments[ins_start.protein_segment.slug] = {
                        'start': {
                            'sequence_number': ins_first,
                            'segment': segment_before,
                        },
                        'end': {
                            'sequence_number': ins_last,
                            'segment': segment_after,
                        },
                    }
                # if the insertion covers two segments, use those two as the segments before and after
                elif ins_start:
                    segment_before = ins_start.protein_segment
                    segment_after = ins_end.protein_segment

                # if the insertion replaces a part of the sequence, add that range as a deletion
                if ins_last > (ins_first + 1):
                    deletions.update(range(ins_first + 1, ins_last))

                # get/insert fusion protein
                fusion, _ = ProteinFusion.objects.get_or_create(name=ins['name'], defaults={
                    'sequence': ins['sequence']})

                # create relationship with protein
                ProteinFusionProtein.objects.create(protein=p, protein_fusion=fusion,
                    segment_before=segment_before, segment_after=segment_after)

            # create expression records
            expression_sys = sd.get('expression_sys')
            if expression_sys:
                ce = Expression()
                ce.construct = pc
                ce.expression_system, created = ExpressionSystem.objects.get_or_create(
                    expression_method=expression_sys['expression_method'],
                    host_cell_type=expression_sys['host_cell_type'],
                    host_cell=expression_sys['host_cell'])
                # remarks live inside the expression_sys mapping, not at the top level
                if 'remarks' in expression_sys:
                    ce.remarks = expression_sys['remarks']
                ce.save()

            # create solubilization records
            solubilization = sd.get('solubilization')
            if solubilization and solubilization.get('steps'):
                so = Solubilization()
                so.construct = pc
                cl = ChemicalList.objects.create()
                so.chemical_list = cl

                for step in solubilization['steps']:
                    if 'type' in step and 'item' in step and 'concentration' in step:
                        chem = Chemical()
                        chem.chemical_type, created = ChemicalType.objects.get_or_create(name=step['type'])
                        chem.name = step['item']
                        chem.save()

                        cc = ChemicalConc()
                        cc.concentration = step['concentration']
                        cc.chemical = chem    # since ChemicalConc has a ForeignKey to Chemical
                        cc.save()
                        cl.chemicals.add(cc)
                    else:
                        self.logger.error('Solubilization step incorrectly defined for {}'.format(p))

                if 'remarks' in solubilization:
                    so.remarks = solubilization['remarks']
                so.save()

            # create purification records (a missing 'steps' key is treated like an empty list)
            purification = sd.get('purification')
            if purification and purification.get('steps'):
                pu = Purification()
                pu.construct = pc
                if 'remarks' in purification:
                    pu.remarks = purification['remarks']
                pu.save()
                for step in purification['steps']:
                    if 'type' in step and 'description' in step:
                        pust = PurificationStep()
                        pust.description = step['description']
                        pust.purification = pu
                        pust.purification_type, created = PurificationStepType.objects.get_or_create(
                            name=step['type'])  # 2 values returned by get_or_create
                        if created:
                            self.logger.info('Created purification step type {}'.format(
                                pust.purification_type))
                        pust.save()
                    else:
                        self.logger.error('Purification step incorrectly defined for {}'.format(p))

            # create crystallization records
            crystallization = sd.get('crystallization')
            if crystallization:
                cy = Crystallization()
                cy.construct = pc
                cyt = CrystallizationMethodTypes.objects.create()
                cy.crystal_type = cyt
                cy.method = crystallization['method']
                cy.settings = crystallization['settings']
                cy.protein_conc = crystallization['protein_conc']
                cl = ChemicalList.objects.create()
                cy.chemical_list = cl

                for step in crystallization['chemicallist']:
                    if 'type' in step and 'item' in step and 'concentration' in step:
                        chem = Chemical()
                        chem.chemical_type, created = ChemicalType.objects.get_or_create(name=step['type'])

                        chem.name = step['item']
                        chem.save()
                        cc = ChemicalConc()
                        cc.concentration = step['concentration']
                        cc.chemical = chem    # since ChemicalConc has a ForeignKey to Chemical
                        cc.save()

                        cl.chemicals.add(cc)
                    else:
                        self.logger.error('Crystallization step incorrectly defined for {}'.format(p))

                cy.aqueous_solution_lipid_ratio = crystallization['aqueous_solution_lipid_ratio_LCP']
                cy.lcp_bolus_volume = crystallization['LCP_bolus_volume']
                cy.precipitant_solution_volume = crystallization['precipitant_solution_volume']
                cy.temp = crystallization['temperature']
                cy.ph = crystallization['ph']

                if 'remarks' in crystallization:
                    cy.remarks = crystallization['remarks']
                cy.save()

            # create residues: copy every parent residue that is not deleted
            prs = Residue.objects.filter(protein_conformation=ppc).prefetch_related(
                'protein_conformation__protein', 'protein_segment', 'generic_number',
                'display_generic_number__scheme', 'alternative_generic_numbers__scheme')
            sequence_parts = []
            for pr in prs:
                if pr.sequence_number in deletions:
                    continue

                r = Residue()
                r.protein_conformation = pc
                r.generic_number = pr.generic_number
                r.display_generic_number = pr.display_generic_number
                r.sequence_number = pr.sequence_number

                # check for split segments
                if pr.protein_segment.slug in split_segments:
                    split = split_segments[pr.protein_segment.slug]
                    if r.sequence_number <= split['start']['sequence_number']:
                        r.protein_segment = split['start']['segment']
                    elif r.sequence_number >= split['end']['sequence_number']:
                        r.protein_segment = split['end']['segment']
                else:
                    r.protein_segment = pr.protein_segment

                # amino acid, check for mutations
                if r.sequence_number in mutations:
                    mutation = mutations[r.sequence_number]
                    if mutation['wt_res'] == pr.amino_acid:
                        r.amino_acid = mutation['mut_res']
                    else:
                        # the whole message is one literal so .format() fills all three fields
                        self.logger.error(
                            'Mutation {} in construct {} does not match wild-type sequence of {}'.format(
                                mutation['full'], pc.protein.name, ppc.protein.entry_name))
                        # NOTE(review): r.amino_acid is left at the model default here
                        # (unchanged behaviour) -- confirm that this is intended
                else:
                    r.amino_acid = pr.amino_acid

                # save amino acid to updated sequence
                sequence_parts.append(r.amino_acid)

                # save residue before populating M2M relations
                r.save()

                # alternative generic numbers
                for agn in pr.alternative_generic_numbers.all():
                    r.alternative_generic_numbers.add(agn)

            # update sequence
            p.sequence = ''.join(sequence_parts)
            p.save()

Пример #7

Показать файл

    def create_rotamers(self, structure, pdb_path):
        """Create Residue and Rotamer records for one chain of a structure.

        The PDB file at ``pdb_path`` is parsed and the peptide sequence of the
        preferred chain is locally aligned against the sequence of the parent
        protein of ``structure.protein_conformation.protein``. The ATOM lines of
        ``structure.pdb_data.pdb`` are then walked residue by residue; for every
        residue numbered below 2000 a ``Residue`` is saved (taking generic
        numbers and segment from the aligned parent residue when one is found)
        together with a ``PdbData``/``Rotamer`` pair holding that residue's
        ATOM lines.

        Args:
            structure: object providing ``preferred_chain``,
                ``protein_conformation`` and ``pdb_data.pdb``.
            pdb_path: path handed to ``PDBParser.get_structure``.

        Returns:
            None.
        """
        wt_lookup = {} #used to match WT seq_number to WT residue record
        pdbseq = {} #used to keep track of pdbseq residue positions vs index in seq
        ref_positions = {} #WT positions in alignment
        mapped_seq = {} # index in construct, tuple of AA and WT [position,AA]

        preferred_chain = structure.preferred_chain

        # only the first chain of a comma-separated list is used
        if len(preferred_chain.split(','))>1: #if A,B
            preferred_chain = preferred_chain.split(',')[0]


        # three-letter residue name -> one-letter amino acid code
        AA = {'ALA':'A', 'ARG':'R', 'ASN':'N', 'ASP':'D',
     'CYS':'C', 'GLN':'Q', 'GLU':'E', 'GLY':'G',
     'HIS':'H', 'ILE':'I', 'LEU':'L', 'LYS':'K',
     'MET':'M', 'PHE':'F', 'PRO':'P', 'SER':'S',
     'THR':'T', 'TRP':'W', 'TYR':'Y', 'VAL':'V'}


        # [0] selects the first entry of the parsed structure
        s = PDBParser(PERMISSIVE=True, QUIET=True).get_structure('ref', pdb_path)[0]
        chain = s[preferred_chain] #select only one chain (avoid n-mer receptors)
        ppb=PPBuilder()
        seq = ''
        i = 1 # 1-based running index of residues appended to seq

        # count of residues numbered below 600 seen so far
        check_1000 = 0
        for pp in ppb.build_peptides(chain): #remove >1000 pos (fusion protein / gprotein)
            for res in pp:
                id = res.id
                if id[1]<600: 
                    check_1000 += 1
                    #need check_1000 to catch structures where they lie in 1000s (4LDE, 4LDL, 4LDO, 4N4W, 4QKX)
                # residues numbered above 1000 are dropped only after more than 200 low-numbered ones were seen
                if id[1]>1000 and check_1000>200: 
                    chain.detach_child(id)

        # rebuild the peptides after pruning and record, per chain id, residue number -> [index in seq, AA]
        for pp in ppb.build_peptides(chain): 
            seq += str(pp.get_sequence()) #get seq from fasta (only chain A)
            for residue in pp:
                residue_id = residue.get_full_id()
                # NOTE: from here on `chain` is rebound to element 2 of the full id, no longer the chain object
                chain = residue_id[2]
                if chain not in pdbseq:
                    pdbseq[chain] = {}
                pos = residue_id[3][1]
                pdbseq[chain][pos] = [i,AA[residue.resname]]
                i += 1

        parent_seq = str(structure.protein_conformation.protein.parent.sequence)

        rs = Residue.objects.filter(protein_conformation__protein=structure.protein_conformation.protein.parent).prefetch_related('display_generic_number','generic_number','protein_segment')

        for r in rs: #required to match WT position to a record (for duplication of GN values)
            wt_lookup[r.sequence_number] = r

        #align WT with structure seq -- make gaps penalties big, so to avoid too much overfitting
        pw2 = pairwise2.align.localms(parent_seq, seq, 2, -4, -4, -.1)

        # only the first returned alignment (pw2[0]) is used below:
        # pw2[0][0] is iterated as the aligned WT string, pw2[0][1] as the aligned structure string
        gaps = 0
        unmapped_ref = {} # WT positions (alignment column minus WT gaps) facing a gap in the structure
        for i, r in enumerate(pw2[0][0], 1): #loop over alignment to create lookups (track pos)
            #print(i,r,pw2[0][1][i-1]) #print alignment for sanity check
            if r == "-":
                gaps += 1
            if r != "-":
                ref_positions[i] = [i-gaps,r]
            elif r == "-":
                ref_positions[i] = [None,'-']

            if pw2[0][1][i-1]=='-':
                unmapped_ref[i-gaps] = '-'

        gaps = 0
        for i, r in enumerate(pw2[0][1], 1): #make second lookup
            if r == "-":
                gaps += 1
            if r != "-":
                # key: 1-based index in the structure sequence; value: [AA, ref_positions entry of that column]
                mapped_seq[i-gaps] = [r,ref_positions[i]]


        pdb = structure.pdb_data.pdb
        protein_conformation=structure.protein_conformation
        temp = '' # ATOM lines collected for the residue currently being read
        check = 0 # residue number taken from the look-ahead line; 0 until the first preferred-chain line is processed
        errors = 0
        # counters that only feed the commented-out summary print at the end
        mismatch_seq = 0
        match_seq = 0
        not_matched = 0
        matched_by_pos = 0
        aa_mismatch = 0

        pdblines_temp = pdb.splitlines()
        pdblines = []
        for line in pdblines_temp: #Get rid of all odd records
            if line.startswith('ATOM'):
                pdblines.append(line)
        pdblines.append('') #add a line to not "run out"

        # fixed-column slices used below: [17:20] residue name, [21] chain id, [22:26] residue number
        for i,line in enumerate(pdblines):
            if line.startswith('ATOM'): 
                chain = line[21]
                if preferred_chain and chain!=preferred_chain: #If preferred is defined and is not the same as the current line, then skip
                    pass
                else:   
                    nextline = pdblines[i+1]
                    residue_number = line[22:26].strip() # not read anywhere below
                    if (check==0 or nextline[22:26].strip()==check) and nextline.startswith('TER')==False and nextline.startswith('ATOM')==True: #If this is either the beginning or the same as previous line add to current rotamer
                        temp += line + "\n"
                        #print('same res',pdb.splitlines()[i+1])
                    else: #if this is a new residue
                        #print(pdb.splitlines()[i+1][22:26].strip(),check)
                        temp += line + "\n"
                        # NOTE(review): `check` is still the int 0 and `residue_name` is unassigned if this
                        # branch is reached before any earlier ATOM line set them (e.g. a single ATOM line) -- confirm
                        if int(check.strip())<2000:
                            residue = Residue()
                            residue.sequence_number = int(check.strip())
                            # residue_name was title-cased at the end of the previous iteration; upper() restores the AA keys
                            residue.amino_acid = AA[residue_name.upper()]
                            residue.protein_conformation = protein_conformation

                            #print(residue.sequence_number,residue.amino_acid) #sanity check
                            try:
                                seq_num_pos = pdbseq[chain][residue.sequence_number][0]
                            except:
                                # residue is not in the pdbseq lookup built above: discard the collected lines and move on
                                #print('failed residue',pdb_path,residue.sequence_number)
                                temp = "" #start new line for rotamer
                                check = pdblines[i+1][22:26].strip()
                                continue
                            if seq_num_pos in mapped_seq:
                                if mapped_seq[seq_num_pos][1][0]==None:
                                    # aligned against a gap in the WT sequence: no WT residue to copy from
                                    #print('no match found') #sanity check
                                    #print(residue.sequence_number,residue.amino_acid) #sanity check
                                    residue.display_generic_number = None
                                    residue.generic_number = None
                                    residue.protein_segment = None
                                    not_matched +=1
                                else:
                                    # WT residue that the alignment paired with this structure residue
                                    wt_r = wt_lookup[mapped_seq[seq_num_pos][1][0]]
                                    if residue.sequence_number!=wt_r.sequence_number and residue.amino_acid!=wt_r.amino_acid and residue.sequence_number in wt_lookup: #if pos numbers not work -- see if the pos number might be in WT and unmapped
                                        if wt_lookup[residue.sequence_number].amino_acid==residue.amino_acid:
                                            if residue.sequence_number in unmapped_ref: #WT was not mapped, so could be it
                                               # print(residue.sequence_number,residue.amino_acid) #sanity check
                                                #print('wrongly matched, better match on pos+aa',residue.sequence_number,residue.amino_acid,wt_r.sequence_number,wt_r.amino_acid)
                                                # prefer the WT residue with the same number and amino acid
                                                wt_r = wt_lookup[residue.sequence_number]
                                                matched_by_pos +=1
                                                match_seq += 1
                                            else:
                                                mismatch_seq += 1
                                                #print('could have been matched, but already aligned to another position',residue.sequence_number,residue.amino_acid,wt_r.sequence_number,wt_r.amino_acid)
                                        else:
                                            #print('WT pos not same AA, mismatch',residue.sequence_number,residue.amino_acid,wt_r.sequence_number,wt_r.amino_acid)
                                            mismatch_seq += 1
                                    elif residue.sequence_number!=wt_r.sequence_number:
                                        #print('WT pos not same pos, mismatch',residue.sequence_number,residue.amino_acid,wt_r.sequence_number,wt_r.amino_acid)
                                        mismatch_seq += 1
                                    elif residue.amino_acid!=wt_r.amino_acid:
                                        #print('aa mismatch',residue.sequence_number,residue.amino_acid,wt_r.sequence_number,wt_r.amino_acid)
                                        aa_mismatch += 1

                                    else:
                                        match_seq += 1
                                    # mismatches are only counted; generic numbers and segment are copied from wt_r regardless
                                    if wt_r.generic_number is not None:
                                        residue.display_generic_number = wt_r.display_generic_number
                                        residue.generic_number = wt_r.generic_number 
                                    else:
                                        residue.display_generic_number = None
                                        residue.generic_number = None
                                        #print('no GN')
                                    residue.protein_segment = wt_r.protein_segment
                            else:
                                # structure index has no entry in the alignment lookup
                                #print('weird error') #sanity check
                                residue.display_generic_number = None
                                residue.generic_number = None
                                residue.protein_segment = None

                            #print('inserted',residue.sequence_number) #sanity check
                            residue.save()

                            # temp holds the ATOM lines collected for this residue
                            rotamer_data, created = PdbData.objects.get_or_create(pdb=temp)
                            rotamer, created = Rotamer.objects.get_or_create(residue=residue, structure=structure, pdbdata=rotamer_data)

                        temp = "" #start new line for rotamer
                        check = pdblines[i+1][22:26].strip()
                    
                    # remember the next line's residue number for the comparison in the next iteration
                    check = pdblines[i+1][22:26].strip()
                # updated for every ATOM line, including lines of non-preferred chains
                chain = line[21]
                residue_name = line[17:20].title() #use title to get GLY to Gly so it matches
        #print(structure.pdb_code.index,'length',len(seq),len(mapped_seq),'mapped res',str(mismatch_seq+match_seq+aa_mismatch),'pos mismatch',mismatch_seq,'aa mismatch',aa_mismatch,'not mapped',not_matched,' mapping off, matched on pos,aa',matched_by_pos)
        return None

Пример #8

Показать файл

Файл: build_g_protein_structures.py Проект: AlibekMamyrbekov/protwis

    def handle(self, *args, **options):
        startTime = datetime.datetime.now()
        self.options = options
        if self.options["purge"]:
            Residue.objects.filter(
                protein_conformation__protein__entry_name__endswith="_a",
                protein_conformation__protein__family__parent__parent__name=
                "Alpha").delete()
            ProteinConformation.objects.filter(
                protein__entry_name__endswith="_a",
                protein__family__parent__parent__name="Alpha").delete()
            Protein.objects.filter(
                entry_name__endswith="_a",
                family__parent__parent__name="Alpha").delete()
            SignprotStructureExtraProteins.objects.all().delete()
            SignprotStructure.objects.all().delete()

        if not options["only_signprot_structures"]:
            # Building protein and protconf objects for g protein structure in complex
            if options["s"]:
                scs = SignprotComplex.objects.filter(
                    structure__pdb_code__index__in=[
                        i.upper() for i in options["s"]
                    ])
            else:
                scs = SignprotComplex.objects.all()
            for sc in scs:
                self.logger.info(
                    "Protein, ProteinConformation and Residue build for alpha subunit of {} is building"
                    .format(sc))
                try:
                    # Alpha subunit
                    try:
                        alpha_protein = Protein.objects.get(
                            entry_name=sc.structure.pdb_code.index.lower() +
                            "_a")
                    except:
                        alpha_protein = Protein()
                        alpha_protein.entry_name = sc.structure.pdb_code.index.lower(
                        ) + "_a"
                        alpha_protein.accession = None
                        alpha_protein.name = sc.structure.pdb_code.index.lower(
                        ) + "_a"
                        alpha_protein.sequence = sc.protein.sequence
                        alpha_protein.family = sc.protein.family
                        alpha_protein.parent = sc.protein
                        alpha_protein.residue_numbering_scheme = sc.protein.residue_numbering_scheme
                        alpha_protein.sequence_type = ProteinSequenceType.objects.get(
                            slug="mod")
                        alpha_protein.source = ProteinSource.objects.get(
                            name="OTHER")
                        alpha_protein.species = sc.protein.species
                        alpha_protein.save()

                    try:
                        alpha_protconf = ProteinConformation.objects.get(
                            protein__entry_name=sc.structure.pdb_code.index.
                            lower() + "_a")
                    except:
                        alpha_protconf = ProteinConformation()
                        alpha_protconf.protein = alpha_protein
                        alpha_protconf.state = ProteinState.objects.get(
                            slug="active")
                        alpha_protconf.save()

                    pdbp = PDBParser(PERMISSIVE=True, QUIET=True)
                    s = pdbp.get_structure("struct",
                                           StringIO(sc.structure.pdb_data.pdb))
                    chain = s[0][sc.alpha]
                    nums = []
                    for res in chain:
                        if "CA" in res and res.id[0] == " ":
                            nums.append(res.get_id()[1])

                    resis = Residue.objects.filter(
                        protein_conformation__protein=sc.protein)
                    num_i = 0
                    temp_seq2 = ""
                    pdb_num_dict = OrderedDict()
                    # Create first alignment based on sequence numbers
                    for n in nums:
                        if sc.structure.pdb_code.index == "6OIJ" and n < 30:
                            nr = n + 6
                        else:
                            nr = n
                        pdb_num_dict[n] = [
                            chain[n], resis.get(sequence_number=nr)
                        ]
                    # Find mismatches
                    mismatches = []
                    for n, res in pdb_num_dict.items():
                        if AA[res[0].get_resname()] != res[1].amino_acid:
                            mismatches.append(res)

                    pdb_lines = sc.structure.pdb_data.pdb.split("\n")
                    seqadv = []
                    for l in pdb_lines:
                        if l.startswith("SEQADV"):
                            seqadv.append(l)
                    mutations, shifted_mutations = OrderedDict(), OrderedDict()
                    # Search for annotated engineered mutations in pdb SEQADV
                    for s in seqadv:
                        line_search = re.search(
                            "SEQADV\s{1}[A-Z\s\d]{4}\s{1}([A-Z]{3})\s{1}([A-Z]{1})\s+(\d+)[\s\S\d]{5}([\s\S\d]{12})([A-Z]{3})\s+(\d+)(\s\S+)",
                            s)
                        if line_search != None:
                            if line_search.group(2) == sc.alpha:
                                if line_search.group(
                                        4).strip() == sc.protein.accession:
                                    if line_search.group(
                                            3) == line_search.group(6):
                                        mutations[int(
                                            line_search.group(3))] = [
                                                line_search.group(1),
                                                line_search.group(5)
                                            ]
                                    else:
                                        shifted_mutations[int(
                                            line_search.group(3))] = [
                                                line_search.group(1),
                                                line_search.group(5),
                                                int(line_search.group(6))
                                            ]
                                else:
                                    # Exception for 6G79
                                    if line_search.group(
                                            3
                                    ) != line_search.group(
                                            6
                                    ) and "CONFLICT" in line_search.group(7):
                                        mutations[int(
                                            line_search.group(3))] = [
                                                line_search.group(1),
                                                line_search.group(5)
                                            ]
                                    # Exception for 5G53
                                    if line_search.group(
                                            4).strip() != sc.protein.accession:
                                        mutations[int(
                                            line_search.group(3))] = [
                                                line_search.group(1),
                                                line_search.group(5)
                                            ]
                    remaining_mismatches = []

                    # Check and clear mismatches that are registered in pdb SEQADV as engineered mutation
                    for m in mismatches:
                        num = m[0].get_id()[1]
                        if num in mutations:
                            if m[0].get_resname() != mutations[num][0] and m[
                                    1].amino_acid != AA[mutations[num][1]]:
                                remaining_mismatches.append(m)
                        elif num in shifted_mutations:
                            remaining_mismatches.append(m)
                        else:
                            remaining_mismatches.append(m)

                    if options["debug"]:
                        print(sc)
                        print(mutations)
                        print(shifted_mutations)
                        print(mismatches)
                        print("======")
                        print(remaining_mismatches)
                        pprint.pprint(pdb_num_dict)

                    no_seqnum_shift = [
                        '6OY9', '6OYA', '6LPB', '6WHA', '7D77', '6XOX', '7L1U',
                        '7L1V'
                    ]

                    # Check if HN is mutated to GNAI1 for the scFv16 stabilizer
                    if sc.protein.entry_name != 'gnai1_human' and len(
                            remaining_mismatches) > 0:
                        target_HN = resis.filter(protein_segment__slug='HN')
                        gnai1_HN = Residue.objects.filter(
                            protein_conformation__protein__entry_name=
                            'gnai1_human',
                            protein_segment__slug='HN')
                        pdb_HN_seq = ''
                        for num, val in pdb_num_dict.items():
                            if num <= target_HN.reverse()[0].sequence_number:
                                pdb_HN_seq += Polypeptide.three_to_one(
                                    val[0].get_resname())
                        if options['debug']:
                            print('Checking if HN is gnai1_human')
                            print(pdb_HN_seq)
                            print(''.join(
                                gnai1_HN.values_list('amino_acid', flat=True)))
                        gnai1_HN_seq = ''.join(
                            gnai1_HN.values_list('amino_acid', flat=True))
                        pw2 = pairwise2.align.localms(gnai1_HN_seq, pdb_HN_seq,
                                                      3, -4, -3, -1)
                        ref_seq, temp_seq = str(pw2[0][0]), str(pw2[0][1])
                        length, match = 0, 0
                        for r, t in zip(ref_seq, temp_seq):
                            if options['debug']:
                                print(r, t)
                            if t != '-':
                                if r == t:
                                    match += 1
                                length += 1
                        identity = match / length * 100
                        if options['debug']:
                            print(identity)
                        if identity > 85:
                            if sc.structure.pdb_code.index not in ['7DFL']:
                                no_seqnum_shift.append(
                                    sc.structure.pdb_code.index)
                            if options['debug']:
                                print(
                                    'INFO: HN has {}% with gnai1_human HN, skipping seqnum shift correction'
                                    .format(round(identity)))

                    # Mismatches remained possibly to seqnumber shift, making pairwise alignment to try and fix alignment
                    if len(
                            remaining_mismatches
                    ) > 0 and sc.structure.pdb_code.index not in no_seqnum_shift:
                        ppb = PPBuilder()
                        seq = ""
                        for pp in ppb.build_peptides(chain, aa_only=False):
                            seq += str(pp.get_sequence())
                        if sc.structure.pdb_code.index in [
                                '7JVQ', '7L1U', '7L1V'
                        ]:
                            pw2 = pairwise2.align.localms(
                                sc.protein.sequence, seq, 3, -4, -3, -1)
                        else:
                            pw2 = pairwise2.align.localms(
                                sc.protein.sequence, seq, 2, -1, -.5, -.1)
                        ref_seq, temp_seq = str(pw2[0][0]), str(pw2[0][1])

                        # Custom fix for A->G mutation at pos 18
                        if sc.structure.pdb_code.index == '7JJO':
                            ref_seq = ref_seq[:18] + ref_seq[19:]
                            temp_seq = temp_seq[:17] + temp_seq[18:]
                        # Custom alignment fixes
                        elif sc.structure.pdb_code.index == '7DFL':
                            ref_seq = 'MTLESIMACCLSEEAKEARRINDEIERQLRRDKRDARRELKLLLLGTGESGKSTFIKQMRIIHGSGYSDEDKRGFTKLVYQNIFTAMQAMIRAMDTLKIPYKYEHNKAHAQLVREVDVEKVSAFENPYVDAIKSLWNDPGIQECYDRRREYQLSDSTKYYLNDLDRVADPAYLPTQQDVLRVRVPTTGIIEYPFDLQSVIFRMVDVGGQRSERRKWIHCFENVTSIMFLVALSEYDQVLVESDNENRMEESKALFRTIITYPWFQNSSVILFLNKKDLLEEKIMYSHLVDYFPEYDGPQRDAQAAREFILKMFVDLNPDSDKIIYSHFTCATDTENIRFVFAAVKDTILQLNLKEYNLV'
                            temp_seq = '--------CTLSAEDKAAVERSKMIDRNLREDGEKARRELKLLLLGTGESGKSTFIKQMRIIHG--------------------------------------------------------------------------------------------------------------------------TGIIEYPFDLQSVIFRMVDVGGQRSERRKWIHCFENVTSIMFLVALSEYDQV----DNENRMEESKALFRTIITYPWFQNSSVILFLNKKDLLEEKIMYSHLVDYFPEYDGPQRDAQAAREFILKMFVDLNPDSDKILYSHFTCATDTENIRFVFAAVKDTILQLNLKEYNLV'
                        elif sc.structure.pdb_code.index == '7JOZ':
                            temp_seq = temp_seq[:67] + (
                                '-' * 14) + 'FNGDS' + temp_seq[86:]
                        elif sc.structure.pdb_code.index == '7AUE':
                            ref_seq = ref_seq[:31].replace('-',
                                                           '') + ref_seq[31:]
                            temp_seq = (
                                9 *
                                '-') + temp_seq[2:5] + temp_seq[5:54].replace(
                                    '-', '') + temp_seq[54:]
                        wt_pdb_dict = OrderedDict()
                        pdb_wt_dict = OrderedDict()
                        j, k = 0, 0
                        for i, ref, temp in zip(range(0, len(ref_seq)),
                                                ref_seq, temp_seq):
                            if options["debug"]:
                                print(i, ref, temp)  # alignment check
                            if ref != "-" and temp != "-":
                                wt_pdb_dict[resis[j]] = pdb_num_dict[nums[k]]
                                pdb_wt_dict[pdb_num_dict[nums[k]]
                                            [0]] = resis[j]
                                j += 1
                                k += 1
                            elif ref == "-":
                                wt_pdb_dict[i] = pdb_num_dict[nums[k]]
                                pdb_wt_dict[pdb_num_dict[nums[k]][0]] = i
                                k += 1
                            elif temp == "-":
                                wt_pdb_dict[resis[j]] = i
                                pdb_wt_dict[i] = resis[j]
                                j += 1
                        # Custom fix for 7JJO isoform difference
                        if sc.structure.pdb_code.index in [
                                '7JJO', '7JOZ', '7AUE'
                        ]:
                            pdb_num_dict = OrderedDict()
                            for wt_res, st_res in wt_pdb_dict.items():
                                if type(st_res) == type([]):
                                    pdb_num_dict[wt_res.sequence_number] = [
                                        st_res[0], wt_res
                                    ]
                        else:
                            for i, r in enumerate(remaining_mismatches):
                                # Adjust for shifted residue when residue is a match
                                if r[0].get_id()[1] - remaining_mismatches[
                                        i - 1][0].get_id()[1] > 1:
                                    pdb_num_dict[r[0].get_id()[1] -
                                                 1][1] = pdb_wt_dict[chain[
                                                     r[0].get_id()[1] - 1]]
                                # Adjust for shifted residue when residue is mutated and it's logged in SEQADV
                                if r[0].get_id()[1] in shifted_mutations:
                                    pdb_num_dict[
                                        r[0].get_id()[1]][1] = resis.get(
                                            sequence_number=shifted_mutations[
                                                r[0].get_id()[1]][2])
                                # Adjust for shift
                                else:
                                    pdb_num_dict[r[0].get_id()
                                                 [1]][1] = pdb_wt_dict[r[0]]
                            if sc.structure.pdb_code.index == '7JVQ':
                                pdb_num_dict[198][1] = Residue.objects.get(
                                    protein_conformation__protein=sc.protein,
                                    sequence_number=346)
                                pdb_num_dict[235][1] = Residue.objects.get(
                                    protein_conformation__protein=sc.protein,
                                    sequence_number=383)
                            elif sc.structure.pdb_code.index == '6PB0':
                                pdb_num_dict[205][1] = Residue.objects.get(
                                    protein_conformation__protein=sc.protein,
                                    sequence_number=205)
                    ### Custom alignment fix for 6WHA mini-Gq/Gi2/Gs chimera
                    elif sc.structure.pdb_code.index == "6WHA":
                        ref_seq = "MTLESIMACCLSEEAKEARRINDEIERQLRRDKRDARRELKLLLLGTGESGKSTFIKQMRIIHGSGYSDEDKRGFTKLVYQNIFTAMQAMIRAMDTLKIPYKYEHNKAHAQLVREVDVEKVSAFENPYVDAIKSLWNDPGIQECYDRRREYQLSDSTKYYLNDLDRVADPAYLPTQQDVLRVRVPTTGIIEYPFDLQSVIFRMVDVGGQRSERRKWIHCFENVTSIMFLVALSEYDQVLVESDNENRMEESKALFRTIITYPWFQNSSVILFLNKKDLLEEKIM--YSHLVDYFPEYDGP----QRDAQAAREFILKMFVDL---NPDSDKIIYSHFTCATDTENIRFVFAAVKDTILQLNLKEYNLV"
                        temp_seq = "----------VSAEDKAAAERSKMIDKNLREDGEKARRTLRLLLLGADNSGKSTIVK----------------------------------------------------------------------------------------------------------------------------------GIFETKFQVDKVNFHMFDVG-----RRKWIQCFNDVTAIIFVVDSSDYNR----------LQEALNDFKSIWNNRWLRTISVILFLNKQDLLAEKVLAGKSKIEDYFPEFARYTTPDPRVTRAKY-FIRKEFVDISTASGDGRHICYPHFTC-VDTENARRIFNDCKDIILQMNLREYNLV"
                        pdb_num_dict = OrderedDict()
                        temp_resis = [res for res in chain]
                        temp_i = 0
                        mapped_cgns = []
                        for i, aa in enumerate(temp_seq):
                            if aa != "-":
                                ref_split_on_gaps = ref_seq[:i + 1].split("-")
                                ref_seqnum = i - (len(ref_split_on_gaps) -
                                                  1) + 1
                                res = resis.get(sequence_number=ref_seqnum)
                                if res.display_generic_number.label in mapped_cgns:
                                    next_presumed_cgn = self.get_next_presumed_cgn(
                                        res)
                                    if next_presumed_cgn:
                                        res = next_presumed_cgn
                                        while res and res.display_generic_number.label in mapped_cgns:
                                            res = self.get_next_presumed_cgn(
                                                res)
                                    else:
                                        print(
                                            "Error: {} CGN does not exist. Incorrect mapping of {} in {}"
                                            .format(next_presumed_cgn,
                                                    chain[nums[temp_i]],
                                                    sc.structure))
                                mapped_cgns.append(
                                    res.display_generic_number.label)
                                pdb_num_dict[nums[temp_i]] = [
                                    chain[nums[temp_i]], res
                                ]
                                temp_i += 1

                    bulked_rotamers = []
                    for key, val in pdb_num_dict.items():
                        # print(key, val) # sanity check
                        if not isinstance(val[1], int):
                            res_obj = Residue()
                            res_obj.sequence_number = val[0].get_id()[1]
                            res_obj.amino_acid = AA[val[0].get_resname()]
                            res_obj.display_generic_number = val[
                                1].display_generic_number
                            res_obj.generic_number = val[1].generic_number
                            res_obj.protein_conformation = alpha_protconf
                            res_obj.protein_segment = val[1].protein_segment
                            res_obj.save()
                            rot = self.create_structure_rotamer(
                                val[0], res_obj, sc.structure)
                            bulked_rotamers.append(rot)
                        else:
                            self.logger.info(
                                "Skipped {} as no annotation was present, while building for alpha subunit of {}"
                                .format(val[1], sc))
                    if options["debug"]:
                        pprint.pprint(pdb_num_dict)
                    Rotamer.objects.bulk_create(bulked_rotamers)
                    self.logger.info(
                        "Protein, ProteinConformation and Residue build for alpha subunit of {} is finished"
                        .format(sc))
                except Exception as msg:
                    if options["debug"]:
                        print("Error: ", sc, msg)
                    self.logger.info(
                        "Protein, ProteinConformation and Residue build for alpha subunit of {} has failed"
                        .format(sc))

        if not options["s"]:
            ### Build SignprotStructure objects from non-complex signprots
            g_prot_alphas = Protein.objects.filter(
                family__slug__startswith="100_001",
                accession__isnull=False)  #.filter(entry_name="gnai1_human")
            complex_structures = SignprotComplex.objects.all().values_list(
                "structure__pdb_code__index", flat=True)
            for a in g_prot_alphas:
                pdb_list = get_pdb_ids(a.accession)
                for pdb in pdb_list:
                    if pdb not in complex_structures:
                        try:
                            data = self.fetch_gprot_data(pdb, a)
                            if data:
                                self.build_g_prot_struct(a, pdb, data)
                        except Exception as msg:
                            self.logger.error(
                                "SignprotStructure of {} {} failed\n{}: {}".
                                format(a.entry_name, pdb, type(msg), msg))

        if options["debug"]:
            print(datetime.datetime.now() - startTime)

Пример #9

Показать файл

    def add_cgn_residues(self, gprotein_list):
        """Create Residue rows carrying CGN generic numbers for the given proteins.

        Reads the tab-separated file at ``self.gprotein_data_file``, keeps the
        rows whose ``Uniprot_ACC`` is contained in ``gprotein_list`` and, for
        each remaining row, fetches or creates the Protein,
        ProteinConformation, ResidueGenericNumber and ProteinSegment it refers
        to. Residue objects are collected in memory and written with
        ``bulk_create`` in batches of 10000, plus one final batch after the
        loop. A ResidueGenericNumberEquivalent in the 'cgn' scheme is ensured
        once per generic-number label.

        Args:
            gprotein_list: values matched against the ``Uniprot_ACC`` column
                (passed to ``Series.isin``).
        """
        # Parsing pdb uniprot file for residues
        self.logger.info('Start parsing PDB_UNIPROT_ENSEMBLE_ALL')
        self.logger.info('Parsing file ' + self.gprotein_data_file)
        residue_data = pd.read_table(self.gprotein_data_file, sep="\t", low_memory=False)
        residue_data = residue_data.loc[residue_data['Uniprot_ACC'].isin(gprotein_list)]
        cgn_scheme = ResidueNumberingScheme.objects.get(slug='cgn')

        # In-memory caches so each protein / generic number / segment /
        # equivalent is looked up in the database only once per call.
        temp = {
            'proteins': {},
            'rgn': {},
            'segment': {},
            'equivalent': {},
        }
        bulk = []
        # Pre-bind the loop variable: the final log line below reads it, and
        # it would otherwise be unbound when the filtered table has no rows.
        index = None

        self.logger.info('Insert residues: {} rows'.format(len(residue_data)))
        for index, row in residue_data.iterrows():
            accession = row['Uniprot_ACC']

            if accession in temp['proteins']:
                pr, pc = temp['proteins'][accession]
            else:
                # fetch protein for protein conformation
                pr, c = Protein.objects.get_or_create(accession=accession)

                # fetch protein conformation
                pc, c = ProteinConformation.objects.get_or_create(protein_id=pr)
                temp['proteins'][accession] = [pr, pc]

            # The CGN label is dot-separated; field 1 is used as the segment
            # slug and field 2 is the numeric position within it.
            cgn = row['CGN']
            cgn_parts = cgn.split('.')

            # fetch residue generic number; single-digit positions get a
            # leading zero in the stored label
            if int(cgn_parts[2]) < 10:
                rgn_label = cgn_parts[0] + '.' + cgn_parts[1] + '.0' + cgn_parts[2]
            else:
                rgn_label = cgn

            if rgn_label in temp['rgn']:
                rgn = temp['rgn'][rgn_label]
            else:
                rgn, c = ResidueGenericNumber.objects.get_or_create(label=rgn_label)
                temp['rgn'][rgn_label] = rgn

            # fetch protein segment id
            segment_slug = cgn_parts[1]
            if segment_slug in temp['segment']:
                ps = temp['segment'][segment_slug]
            else:
                ps, c = ProteinSegment.objects.get_or_create(slug=segment_slug, proteinfamily='Gprotein')
                temp['segment'][segment_slug] = ps

            try:
                bulk_r = Residue(sequence_number=row['Position'], protein_conformation=pc, amino_acid=row['Residue'], generic_number=rgn, display_generic_number=rgn, protein_segment=ps)
                bulk.append(bulk_r)
            except Exception:
                # Best effort: a row that cannot be turned into a Residue is
                # logged and skipped; KeyboardInterrupt/SystemExit propagate.
                self.logger.error("Failed to add residues")

            # Flush a full batch. The emptiness guard avoids a pointless
            # "Inserted bulk 0" log and empty bulk_create when the very first
            # Residue of a batch could not be built.
            if bulk and len(bulk) % 10000 == 0:
                self.logger.info('Inserted bulk {} (Index:{})'.format(len(bulk), index))
                Residue.objects.bulk_create(bulk)
                bulk = []

            # Add also to the ResidueGenericNumberEquivalent table needed for single residue selection
            try:
                if rgn.label not in temp['equivalent']:
                    ResidueGenericNumberEquivalent.objects.get_or_create(label=rgn.label, default_generic_number=rgn, scheme=cgn_scheme)
                    temp['equivalent'][rgn.label] = 1
            except Exception:
                self.logger.error("Failed to add residues to ResidueGenericNumberEquivalent")

        # Write whatever is left over from the last, partially filled batch.
        self.logger.info('Inserted bulk {} (Index:{})'.format(len(bulk), index))
        Residue.objects.bulk_create(bulk)

Пример #10

Показать файл

Файл: build_construct_proteins.py Проект: akumar03/protwis

    def main_func(self, positions, iteration):
        """Create construct Protein, ProteinConformation and Residue records from YAML files.

        Processes the slice of ``self.filenames`` selected by ``positions``.
        Each file found under ``self.construct_data_dir`` (hidden files are
        ignored) is parsed as YAML. A new Protein is created as a child of the
        parent protein named in the file, together with a ProteinConformation
        in the default state. The parent's residues are then copied to the
        construct, leaving out deleted positions, applying point mutations and
        re-assigning segments that are split by an insertion. Finally the
        construct's sequence is rebuilt from the copied residues.

        Args:
            positions: two-item sequence ``(start, end)`` indexing into
                ``self.filenames``; a falsy ``end`` means "until the end".
            iteration: not used in this method body.
        """
        # filenames
        if not positions[1]:
            filenames = self.filenames[positions[0]:]
        else:
            filenames = self.filenames[positions[0]:positions[1]]

        # parse files
        for source_file in filenames:
            source_file_path = os.sep.join([self.construct_data_dir, source_file])
            if os.path.isfile(source_file_path) and source_file[0] != '.':
                self.logger.info('Reading file {}'.format(source_file_path))
                # read the yaml file
                with open(source_file_path, 'r') as f:
                    # NOTE(review): yaml.load without an explicit Loader can
                    # construct arbitrary Python objects and is rejected by
                    # newer PyYAML releases; consider yaml.safe_load if these
                    # files only contain plain data - confirm before changing.
                    sd = yaml.load(f)

                    # is a protein specified?
                    if 'protein' not in sd:
                        self.logger.error('Protein not specified for construct, skipping')
                        continue

                    # fetch the parent protein
                    try:
                        ppc = ProteinConformation.objects.prefetch_related('protein__family', 'protein__species',
                            'protein__residue_numbering_scheme').get(protein__entry_name=sd['protein'],
                            state__slug=settings.DEFAULT_PROTEIN_STATE)
                    except ProteinConformation.DoesNotExist:
                        # abort if parent protein is not found
                        self.logger.error('Parent protein {} for construct {} not found, aborting!'.format(
                            sd['protein'], sd['name']))
                        continue

                    # sequence type
                    try:
                        sequence_type, created = ProteinSequenceType.objects.get_or_create(slug='mod',
                            defaults={'name': 'Modified'})
                        if created:
                            self.logger.info('Created sequence type {}'.format(sequence_type))
                    except IntegrityError:
                        # the row was created concurrently; fetch it instead
                        sequence_type = ProteinSequenceType.objects.get(slug='mod')

                    # protein source
                    try:
                        protein_source, created = ProteinSource.objects.get_or_create(name='OTHER')
                        if created:
                            self.logger.info('Created protein source {}'.format(protein_source))
                    except IntegrityError:
                        protein_source = ProteinSource.objects.get(name='OTHER')

                    # create a protein record
                    p = Protein()
                    p.parent = ppc.protein
                    p.family = ppc.protein.family
                    p.species = ppc.protein.species
                    p.residue_numbering_scheme = ppc.protein.residue_numbering_scheme
                    p.sequence_type = sequence_type
                    p.source = protein_source
                    p.entry_name = slugify(strip_tags(sd['name']))
                    p.name = sd['name']
                    # provisional; replaced by the rebuilt sequence at the end
                    p.sequence = ppc.protein.sequence

                    # save protein (construct)
                    try:
                        p.save()
                        self.logger.info('Created construct {} with parent protein {}'.format(p.name,
                            ppc.protein.entry_name))
                    except Exception:
                        self.logger.error('Failed creating construct {} with parent protein {}'.format(p.name,
                            ppc.protein.entry_name))
                        continue

                    # create protein conformation record
                    pc = ProteinConformation()
                    pc.protein = p
                    pc.state = ProteinState.objects.get(slug=settings.DEFAULT_PROTEIN_STATE)
                    try:
                        pc.save()
                        self.logger.info('Created conformation {} of protein {}'.format(pc.state.name, p.name))
                    except Exception:
                        # NOTE(review): processing continues with an unsaved
                        # conformation here (no `continue`) - confirm intent.
                        self.logger.error('Failed creating conformation {} of protein {}'.format(pc.state.name,
                            p.entry_name))

                    # create residue records
                    # deletions are given as inclusive [start, end] ranges
                    deletions = []
                    if 'deletions' in sd and sd['deletions']:
                        for t in sd['deletions']:
                            deletions += list(range(t[0], t[1] + 1))

                    # mutations are strings of the form <wt><number><mut>
                    mutations = {}
                    if 'mutations' in sd and sd['mutations']:
                        for m in sd['mutations']:
                            res_num = int(m[1:-1])
                            mutations[res_num] = {
                                'wt_res': m[0],
                                'mut_res': m[-1],
                                'full': m,
                            }

                    # insertions
                    split_segments = {}
                    if 'insertions' in sd and sd['insertions']:
                        for ins in sd['insertions']:
                            ins_start = Residue.objects.get(protein_conformation=ppc,
                                sequence_number=ins['positions'][0])
                            ins_end = Residue.objects.get(protein_conformation=ppc,
                                sequence_number=ins['positions'][1])
                            # if the insertion is within only one segment (the usual case), split that
                            # segment into two segments
                            if ins_start and ins_start.protein_segment == ins_end.protein_segment:
                                # get/create split protein segments
                                slug_1 = ins_start.protein_segment.slug + "_1"
                                try:
                                    segment_before, created = ProteinSegment.objects.get_or_create(slug=slug_1,
                                        defaults={'name': ins_start.protein_segment.name,
                                        'category': ins_start.protein_segment.category, 'partial': True})
                                    if created:
                                        self.logger.info('Created protein segment {}'.format(segment_before))
                                except IntegrityError:
                                    segment_before = ProteinSegment.objects.get(slug=slug_1)

                                slug_2 = ins_start.protein_segment.slug + "_2"
                                try:
                                    segment_after, created = ProteinSegment.objects.get_or_create(slug=slug_2,
                                        defaults={'name': ins_start.protein_segment.name,
                                        'category': ins_start.protein_segment.category, 'partial': True})
                                    if created:
                                        self.logger.info('Created protein segment {}'.format(segment_after))
                                except IntegrityError:
                                    segment_after = ProteinSegment.objects.get(slug=slug_2)

                                # keep track of  information about split segments
                                split_segments[ins_start.protein_segment.slug] = {
                                    'start': {
                                        'sequence_number': ins['positions'][0],
                                        'segment': segment_before,
                                    },
                                    'end': {
                                        'sequence_number': ins['positions'][1],
                                        'segment': segment_after,
                                    },
                                }
                            # if the insertion covers two segments, use those two as the segments before and after
                            elif ins_start:
                                segment_before = ins_start.protein_segment
                                segment_after = ins_end.protein_segment

                            # if the insertion replaces a part of the sequence, add that range as a deletion
                            if ins['positions'][1] > (ins['positions'][0] + 1):
                                deletions += list(range((ins['positions'][0] + 1), ins['positions'][1]))

                            # get/insert fusion protein
                            fusion, create = ProteinFusion.objects.get_or_create(name=ins['name'], defaults={
                                'sequence': ins['sequence']})

                            # create relationship with protein
                            ProteinFusionProtein.objects.create(protein=p, protein_fusion=fusion,
                                segment_before=segment_before, segment_after=segment_after)

                    # copy the parent's residues to the construct
                    prs = Residue.objects.filter(protein_conformation=ppc).prefetch_related(
                        'protein_conformation__protein', 'protein_segment', 'generic_number',
                        'display_generic_number__scheme', 'alternative_generic_numbers__scheme')
                    updated_sequence = ''
                    for pr in prs:
                        if pr.sequence_number not in deletions:
                            r = Residue()
                            r.protein_conformation = pc
                            r.generic_number = pr.generic_number
                            r.display_generic_number = pr.display_generic_number
                            r.sequence_number = pr.sequence_number

                            # check for split segments
                            if pr.protein_segment.slug in split_segments:
                                rsns = split_segments[pr.protein_segment.slug]['start']['sequence_number']
                                rsne = split_segments[pr.protein_segment.slug]['end']['sequence_number']
                                if r.sequence_number <= rsns:
                                    r.protein_segment = split_segments[pr.protein_segment.slug]['start']['segment']
                                elif r.sequence_number >= rsne:
                                    r.protein_segment = split_segments[pr.protein_segment.slug]['end']['segment']
                            else:
                                r.protein_segment = pr.protein_segment

                            # amino acid, check for mutations
                            if r.sequence_number in mutations:
                                if mutations[r.sequence_number]['wt_res'] == pr.amino_acid:
                                    r.amino_acid = mutations[r.sequence_number]['mut_res']
                                else:
                                    # Format the complete message: previously
                                    # .format() bound only to the trailing
                                    # ' of {}' fragment, leaving the first two
                                    # placeholders unfilled.
                                    # NOTE(review): r.amino_acid is left at the
                                    # model default in this branch - confirm.
                                    self.logger.error(
                                        'Mutation {} in construct {} does not match wild-type sequence of {}'.format(
                                            mutations[r.sequence_number]['full'], pc.protein.name,
                                            ppc.protein.entry_name))
                            else:
                                r.amino_acid = pr.amino_acid

                            # save amino acid to updated sequence
                            updated_sequence += r.amino_acid

                            # save residue before populating M2M relations
                            r.save()

                            # alternative generic numbers
                            agns = pr.alternative_generic_numbers.all()
                            for agn in agns:
                                r.alternative_generic_numbers.add(agn)

                    # update sequence
                    p.sequence = updated_sequence
                    p.save()

Пример #11

Показать файл

Файл: build_construct_proteins.py Проект: ismaelresp/protwis

    def main_func(self, positions, iteration):
        """Create construct proteins from a slice of the construct YAML files.

        For every selected file in ``self.construct_data_dir`` this reads the
        construct definition, creates a ``Protein`` and ``ProteinConformation``
        for the construct, registers fusion-protein insertions, and copies the
        parent protein's residues while applying deletions and mutations.
        Finally the construct's ``sequence`` is rewritten from the copied
        residues.

        Args:
            positions: two-item sequence ``(start, end)`` used to slice
                ``self.filenames``; a falsy ``end`` means "to the end of the
                list".
            iteration: not used by this method.

        Files that cannot be processed (no protein specified, parent protein
        missing, construct or conformation cannot be saved) are logged and
        skipped.
        """
        # filenames: a falsy end position selects everything from start onwards
        filenames = self.filenames[positions[0]:positions[1] or None]

        # parse files
        for source_file in filenames:
            source_file_path = os.sep.join(
                [self.construct_data_dir, source_file])
            # skip anything that is not a regular file, and hidden files
            if not os.path.isfile(source_file_path) or source_file[0] == '.':
                continue

            self.logger.info('Reading file {}'.format(source_file_path))

            # read the yaml file; the handle is only needed for the load itself
            # NOTE(review): yaml.load() without an explicit Loader can build
            # arbitrary Python objects, and newer PyYAML releases require the
            # Loader argument - consider yaml.safe_load() if the construct
            # files are plain YAML.
            with open(source_file_path, 'r') as f:
                sd = yaml.load(f)

            # is a protein specified? (an empty file loads as None)
            if not sd or 'protein' not in sd:
                self.logger.error(
                    'Protein not specified for construct, skipping')
                continue

            # fetch the parent protein
            try:
                ppc = ProteinConformation.objects.prefetch_related(
                    'protein__family', 'protein__species',
                    'protein__residue_numbering_scheme').get(
                        protein__entry_name=sd['protein'],
                        state__slug=settings.DEFAULT_PROTEIN_STATE)
            except ProteinConformation.DoesNotExist:
                # skip this construct if the parent protein is not found
                self.logger.error(
                    'Parent protein {} for construct {} not found, aborting!'
                    .format(sd['protein'], sd['name']))
                continue

            # sequence type
            try:
                sequence_type, created = ProteinSequenceType.objects.get_or_create(
                    slug='mod', defaults={'name': 'Modified'})
                if created:
                    self.logger.info(
                        'Created sequence type {}'.format(sequence_type))
            except IntegrityError:
                # get_or_create failed on a uniqueness constraint, so the row
                # exists by now; fetch it
                sequence_type = ProteinSequenceType.objects.get(slug='mod')

            # protein source
            try:
                protein_source, created = ProteinSource.objects.get_or_create(
                    name='OTHER')
                if created:
                    self.logger.info(
                        'Created protein source {}'.format(protein_source))
            except IntegrityError:
                protein_source = ProteinSource.objects.get(name='OTHER')

            # create a protein record, inheriting classification from the parent
            p = Protein()
            p.parent = ppc.protein
            p.family = ppc.protein.family
            p.species = ppc.protein.species
            p.residue_numbering_scheme = ppc.protein.residue_numbering_scheme
            p.sequence_type = sequence_type
            p.source = protein_source
            p.entry_name = slugify(strip_tags(sd['name']))
            p.name = sd['name']
            p.sequence = ppc.protein.sequence

            # save protein (construct)
            try:
                p.save()
                self.logger.info(
                    'Created construct {} with parent protein {}'.format(
                        p.name, ppc.protein.entry_name))
            except Exception:
                self.logger.error(
                    'Failed creating construct {} with parent protein {}'
                    .format(p.name, ppc.protein.entry_name))
                continue

            # create protein conformation record
            pc = ProteinConformation()
            pc.protein = p
            pc.state = ProteinState.objects.get(
                slug=settings.DEFAULT_PROTEIN_STATE)
            try:
                pc.save()
                self.logger.info(
                    'Created conformation {} of protein {}'.format(
                        pc.state.name, p.name))
            except Exception:
                self.logger.error(
                    'Failed creating conformation {} of protein {}'.format(
                        pc.state.name, p.entry_name))
                # residues cannot be attached to an unsaved conformation
                continue

            # sequence numbers to leave out of the construct; a set keeps the
            # per-residue membership test below cheap
            deletions = set()
            for t in sd.get('deletions') or []:
                # each entry is an inclusive (first, last) range
                deletions.update(range(t[0], t[1] + 1))

            # mutations are written like "A123C": wild type, position, mutant
            mutations = {}
            for m in sd.get('mutations') or []:
                mutations[int(m[1:-1])] = {
                    'wt_res': m[0],
                    'mut_res': m[-1],
                    'full': m,
                }

            # insertions
            split_segments = {}
            for ins in sd.get('insertions') or []:
                first_pos, last_pos = ins['positions'][0], ins['positions'][1]
                ins_start = Residue.objects.get(
                    protein_conformation=ppc, sequence_number=first_pos)
                ins_end = Residue.objects.get(
                    protein_conformation=ppc, sequence_number=last_pos)

                if ins_start.protein_segment == ins_end.protein_segment:
                    # the insertion is within only one segment (the usual
                    # case): split that segment into "<slug>_1" and "<slug>_2"
                    parent_segment = ins_start.protein_segment
                    halves = []
                    for suffix in ('_1', '_2'):
                        half_slug = parent_segment.slug + suffix
                        try:
                            half, created = ProteinSegment.objects.get_or_create(
                                slug=half_slug,
                                defaults={
                                    'name': parent_segment.name,
                                    'category': parent_segment.category,
                                    'partial': True,
                                })
                            if created:
                                self.logger.info(
                                    'Created protein segment {}'.format(half))
                        except IntegrityError:
                            half = ProteinSegment.objects.get(slug=half_slug)
                        halves.append(half)
                    segment_before, segment_after = halves

                    # keep track of information about split segments
                    split_segments[parent_segment.slug] = {
                        'start': {
                            'sequence_number': first_pos,
                            'segment': segment_before,
                        },
                        'end': {
                            'sequence_number': last_pos,
                            'segment': segment_after,
                        },
                    }
                else:
                    # the insertion covers two segments: use those two as the
                    # segments before and after
                    segment_before = ins_start.protein_segment
                    segment_after = ins_end.protein_segment

                # if the insertion replaces a part of the sequence, add that
                # range as a deletion (the range is empty for adjacent positions)
                deletions.update(range(first_pos + 1, last_pos))

                # get/insert fusion protein
                fusion, _ = ProteinFusion.objects.get_or_create(
                    name=ins['name'],
                    defaults={'sequence': ins['sequence']})

                # create relationship with protein
                ProteinFusionProtein.objects.create(
                    protein=p,
                    protein_fusion=fusion,
                    segment_before=segment_before,
                    segment_after=segment_after)

            # copy the parent's residues onto the construct conformation
            prs = Residue.objects.filter(
                protein_conformation=ppc).prefetch_related(
                    'protein_conformation__protein', 'protein_segment',
                    'generic_number', 'display_generic_number__scheme',
                    'alternative_generic_numbers__scheme')
            sequence_parts = []
            for pr in prs:
                if pr.sequence_number in deletions:
                    continue

                r = Residue()
                r.protein_conformation = pc
                r.generic_number = pr.generic_number
                r.display_generic_number = pr.display_generic_number
                r.sequence_number = pr.sequence_number

                # check for split segments
                split = split_segments.get(pr.protein_segment.slug)
                if split is None:
                    r.protein_segment = pr.protein_segment
                elif r.sequence_number <= split['start']['sequence_number']:
                    r.protein_segment = split['start']['segment']
                elif r.sequence_number >= split['end']['sequence_number']:
                    r.protein_segment = split['end']['segment']

                # amino acid, check for mutations
                mutation = mutations.get(r.sequence_number)
                if mutation is None:
                    r.amino_acid = pr.amino_acid
                elif mutation['wt_res'] == pr.amino_acid:
                    r.amino_acid = mutation['mut_res']
                else:
                    # format the whole message, not just its last fragment
                    self.logger.error(
                        ('Mutation {} in construct {} does not match wild-type sequence'
                         ' of {}').format(mutation['full'], pc.protein.name,
                                          ppc.protein.entry_name))
                    # NOTE(review): r.amino_acid is not assigned on this path,
                    # so the residue keeps whatever the model defaults to -
                    # confirm whether falling back to the wild type is wanted.

                # save amino acid to updated sequence
                sequence_parts.append(r.amino_acid)

                # save residue before populating M2M relations
                r.save()

                # alternative generic numbers
                for agn in pr.alternative_generic_numbers.all():
                    r.alternative_generic_numbers.add(agn)

            # update sequence
            p.sequence = ''.join(sequence_parts)
            p.save()