예제 #1
0
def source_specific_proteins_with_ptm_mutations():
    """Count proteins, kinases and kinase groups having PTM mutations.

    The counts are computed separately for every model returned by
    ``mutation_sources()`` and additionally for a ``'merged'`` entry,
    for which ``None`` is passed as the model.

    Returns:
        A dict with three labelled entries, each mapping a source name
        to the number of distinct matching rows.
    """
    source_models = mutation_sources()
    source_models['merged'] = None

    def count_distinct(id_column, source_model, *join_targets):
        # join the requested models in order, then keep only the rows
        # whose protein has PTM mutations in the given dataset
        query = db.session.query(distinct(id_column))
        for target in join_targets:
            query = query.join(target)
        query = query.filter(
            Protein.has_ptm_mutations_in_dataset(source_model) == True
        )
        return query.count()

    protein_counts = {}
    kinase_counts = {}
    kinase_group_counts = {}

    for source_name, source_model in tqdm(source_models.items()):
        protein_counts[source_name] = count_distinct(
            Protein.id, source_model
        )
        kinase_counts[source_name] = count_distinct(
            models.Kinase.id, source_model, Protein
        )
        kinase_group_counts[source_name] = count_distinct(
            models.KinaseGroup.id, source_model, models.Kinase, Protein
        )

    return {
        'Proteins with PTM muts': protein_counts,
        'Kinases with PTM muts': kinase_counts,
        'Kinase groups with PTM muts': kinase_group_counts
    }
예제 #2
0
    def test_select_preferred_isoform(self):
        """Check which isoform of a gene is selected as the preferred one."""
        # rows of: (refseq, sequence, has a reviewed UniProt entry for isoform 1)
        isoforms_spec = [
            ('NM_001', 'MA', False),
            ('NM_002', 'MAA', True),
            # we want the next one: canonical according to uniprot,
            # then longest, then oldest in refseq
            ('NM_003', 'MAAA', True),
            ('NM_004', 'MAAA', True),
            ('NM_005', 'MAAAA', False)
        ]
        expected_refseq = 'NM_003'

        gene = Gene(name='Gene X')
        for refseq, sequence, canonical in isoforms_spec:
            candidate = Protein(refseq=refseq, sequence=sequence, gene=gene)
            if not canonical:
                continue
            candidate.external_references = ProteinReferences(
                uniprot_entries=[UniprotEntry(isoform=1, reviewed=True)]
            )

        db.session.add(gene)

        chosen = select_preferred_isoform(gene)
        assert chosen
        assert chosen.refseq == expected_refseq
예제 #3
0
    def test_edge_cases_mapping(self):
        """Map two sites lying on the very ends of NM_01; expect four mapped sites."""
        isoforms = [
            #                                 123456789
            Protein(refseq='NM_01', sequence='AXAXAYAYA'),
            # C-terminal part was trimmed
            Protein(refseq='NM_02', sequence='AXAXA'),
            # N-terminal part was trimmed
            Protein(refseq='NM_03', sequence='AYAYA'),
        ]
        gene_t = Gene(name='T', isoforms=isoforms)
        db.session.add(gene_t)
        db.session.commit()

        def site_id(s):
            return f'{s.position}{s.residue}'

        proteins_by_refseq = create_key_model_dict(Protein, 'refseq')
        mapper = SiteMapper(proteins_by_refseq, site_id)

        # both sites are defined on NM_01: one on its first and one on its
        # last residue
        raw_sites = {
            'site at N-terminus edge': ('T', 'NM_01', 1, '^AX', 'A', 2),
            'site at C-terminus edge': ('T', 'NM_01', 9, 'YA$', 'A', 2),
        }
        column_names = [
            'gene', 'refseq', 'position', 'sequence', 'residue',
            'left_sequence_offset'
        ]
        sites = DataFrame.from_dict(
            data=raw_sites, orient='index', columns=column_names
        )

        mapped_sites = mapper.map_sites_by_sequence(sites)

        assert len(mapped_sites) == 4
예제 #4
0
    def test_sites(self):
        """The sites view returns both sites of the protein as JSON."""
        protein = Protein(**test_protein_data())
        db.session.add(protein)
        protein.sites = [
            Site(position=3, residue='R', type='phosphorylation'),
            Site(position=4, residue='T', type='methylation')
        ]

        response = self.client.get('/protein/sites/NM_000123')

        assert response.status_code == 200
        assert response.content_type == 'application/json'

        site_reprs = response.json
        assert len(site_reprs) == 2

        # the phosphorylation site has to be present among the results
        phosphorylation_reprs = [
            site_repr
            for site_repr in site_reprs
            if site_repr['type'] == 'phosphorylation'
        ]
        assert phosphorylation_reprs
        assert phosphorylation_reprs[-1]
예제 #5
0
    def test_has_sites_in_range(self):
        """Check ``has_sites_in_range`` for a window of 7 residues around position 100."""
        mutation_position = 100
        window_start = mutation_position - 7
        window_end = mutation_position + 7

        # pairs of: (positions of sites in the protein, expected result)
        cases = [
            ((93, 107), True),
            ((92, 108), False),
            ((100,), True),
            ((), False),
            ((90,), False),
            ((110,), False),
            ((94, 95, 96, 97, 98, 99), True),
            ((101, 102, 103, 104, 105, 106), True),
            ((93,), True),
            ((107,), True)
        ]

        for sites_positions, expected_result in cases:
            sites = [Site(position=pos) for pos in sites_positions]
            protein = Protein(sites=sites)
            result = protein.has_sites_in_range(window_start, window_end)
            assert result == expected_result
예제 #6
0
    def test_mutation(self):
        """Query the mutation view with and without a source filter."""
        protein = Protein(**test_protein_data())
        protein.mutations = create_test_mutations()
        db.session.add(protein)

        base_url = '/protein/mutation/NM_000123'
        mc3_filter = '?filters=Mutation.sources:in:MC3'

        # pairs of: (url to request, expected number of results)
        expected_counts = [
            (base_url + '/1/K', 1),
            (base_url + '/1/K' + mc3_filter, 1),
            (base_url + '/2/K', 1),
        ]
        for url, expected_results_cnt in expected_counts:
            response = self.client.get(url)
            assert len(response.json) == expected_results_cnt

        # the mutation exists, but is excluded by the filter - expect a warning
        response = self.client.get(base_url + '/2/K' + mc3_filter)
        warning = (
            'Warning: There is a mutation, '
            'but it does not satisfy given filters'
        )
        assert warning in response.json

        response = self.client.get(base_url + '/2/K')
        mutation_repr = response.json.pop()
        expected_fields = [
            ('ref', 'A'),
            ('pos', 2),
            ('alt', 'K'),
            ('protein', 'NM_000123'),
        ]
        for field, expected_value in expected_fields:
            assert mutation_repr[field] == expected_value
예제 #7
0
def create_test_models():
    """Build a small graph of related model instances for use in tests.

    Creates a protein with one mutation (annotated by an ``MC3Mutation`` and
    an ``InheritedMutation``), a second protein acting as a kinase and a
    site on the first protein linked to that kinase.

    Returns:
        The result of ``locals()``, i.e. a dict mapping the local variable
        names used below (``protein``, ``mutation``, ``protein_kinase``,
        ``kinase``, ``site``) to the created objects. Renaming a local
        variable therefore changes the keys seen by callers.
    """
    protein = Protein(refseq='NM_0001',
                      gene=Gene(name='SOMEGENE'),
                      sequence='ABCD')
    mutation = Mutation(protein=protein, position=1, alt='E')
    protein.gene.preferred_isoform = protein

    # the two mutation details are not bound to names on purpose:
    # they are attached to `mutation` and do not end up in locals()
    MC3Mutation(mutation=mutation,
                cancer=Cancer(code='CAN'),
                samples='Sample A,Sample B',
                count=2)
    InheritedMutation(mutation=mutation,
                      clin_data=[
                          ClinicalData(disease=Disease(name='Some disease'),
                                       sig_code=5),
                          ClinicalData(disease=Disease(name='Other disease'),
                                       sig_code=2)
                      ])

    protein_kinase = Protein(refseq='NM_0002',
                             gene=Gene(name='OTHERGENE'),
                             sequence='ABCD')
    kinase = Kinase(name='Kinase name', protein=protein_kinase)
    site = Site(protein=protein,
                position=1,
                residue='A',
                kinases={kinase},
                pmid={1, 2},
                types={SiteType(name='glycosylation')})
    protein.sites = [site]

    # every local name defined above becomes a key of the returned dict
    return locals()
예제 #8
0
    def test_mapping(self):
        """Map sites by sequence onto other isoforms, with and without the gene column."""
        isoforms_of_a = [
            # the full isoform of gene A
            Protein(refseq='NM_01', sequence='AAAAAAAAAXAA'),
            # a trimmed isoform of gene A
            Protein(refseq='NM_02', sequence='AAAXAA'),
        ]
        gene_a = Gene(name='A', isoforms=isoforms_of_a)
        isoforms_of_b = [
            Protein(refseq='NM_03', sequence='BBBBBBBBBYBB'),
            Protein(refseq='NM_04', sequence='BBBYBB'),
        ]
        gene_b = Gene(name='B', isoforms=isoforms_of_b)
        db.session.add_all([gene_a, gene_b])

        # whoops, NM_03 has been accidentally removed (!)
        lost_isoform = Protein.query.filter_by(refseq='NM_03').one()
        db.session.delete(lost_isoform)
        db.session.commit()

        def site_id(s):
            return f'{s.position}{s.residue}'

        mapper = SiteMapper(create_key_model_dict(Protein, 'refseq'), site_id)

        raw_sites = {
            'good site A': ('A', 'NM_01', 10, 'AXA', 'X', 1),
            'lost isoform': ('B', 'NM_03', 10, 'BYB', 'Y', 1)
        }
        column_names = [
            'gene', 'refseq', 'position', 'sequence', 'residue',
            'left_sequence_offset'
        ]
        sites = DataFrame.from_dict(
            data=raw_sites, orient='index', columns=column_names
        )

        mapped_sites = mapper.map_sites_by_sequence(sites)
        sites_by_isoform = group_by_isoform(mapped_sites)

        # one from NM_01 (defined), from NM_02 (mapped), from NM_04 (mapped)
        assert len(mapped_sites) == 3
        assert set(sites_by_isoform) == {'NM_01', 'NM_02', 'NM_04'}

        full_a = sites_by_isoform['NM_01']
        trimmed_a = sites_by_isoform['NM_02']
        assert full_a.residue == trimmed_a.residue
        assert trimmed_a.residue == 'X'
        assert full_a.position == 10
        assert trimmed_a.position == 4

        trimmed_b = sites_by_isoform['NM_04']
        assert trimmed_b.residue == 'Y'
        assert trimmed_b.position == 4

        # will the mapping to NM_02 still work if we remove 'gene' column?
        sites.drop(columns=['gene'], inplace=True)
        mapped_sites = mapper.map_sites_by_sequence(sites)
        sites_by_isoform = group_by_isoform(mapped_sites)

        assert len(mapped_sites) == 2
        assert set(sites_by_isoform) == {'NM_01', 'NM_02'}
예제 #9
0
    def test_known_mutations(self):
        """The known mutations view returns four mutations for the test protein."""
        protein = Protein(**test_protein_data())
        protein.mutations = create_test_mutations()
        db.session.add(protein)

        known_mutations = self.client.get(
            '/protein/known_mutations/NM_000123'
        ).json
        assert len(known_mutations) == 4
예제 #10
0
    def test_train_model(self):
        """Train a model on two small proteins and inspect the params of CDK1."""
        phosphorylation = SiteType(name='phosphorylation')

        # non-phosphorylated serine residues are needed to generate negative sites
        background_protein = Protein(
            refseq='NM_007',
            sequence='--------SLPA-----------SVIT-------'
        )
        db.session.add(
            Gene(isoforms=[background_protein],
                 preferred_isoform=background_protein)
        )

        # phosphorylated, with sites
        phosphorylated_protein = Protein(
            refseq='NM_001',
            sequence='--------SPAK-----------SPAR-------'
        )
        db.session.add(
            Gene(isoforms=[phosphorylated_protein],
                 preferred_isoform=phosphorylated_protein)
        )

        cdk1 = Kinase(name='CDK1', is_involved_in={phosphorylation})

        for position in [9, 24]:
            db.session.add(
                Site(position=position,
                     types={phosphorylation},
                     residue='S',
                     protein=phosphorylated_protein,
                     kinases={cdk1})
            )

        db.session.commit()

        with TemporaryDirectory() as temp_dir:
            model = train_model(phosphorylation,
                                sequences_dir=temp_dir,
                                sampling_n=2,
                                threshold=2)

        # the model should have one set of params - for CDK1 kinase
        assert len(model) == 1

        pwm = model.rx2('CDK1').rx2('pwm')

        # and the position-specific weight matrix should be created
        assert pwm

        # the very detailed testing should be performed by rMIMP,
        # but why not test the basics?

        weights_of_central_aa = dict(zip(pwm.rownames, pwm.rx(True, 8)))
        highest_weight = max(weights_of_central_aa.values())
        assert weights_of_central_aa['S'] == highest_weight
예제 #11
0
    def test_impact_on_ptm(self):
        """A mutation on the same position as a site has a 'direct' impact."""
        mutation = Mutation(position=61)
        protein = Protein(refseq='NM_00001', mutations=[mutation])
        db.session.add(protein)
        protein.sites = [Site(position=position) for position in (61, 54, 51)]

        assert mutation.impact_on_ptm() == 'direct'
예제 #12
0
def save_protein(request, strain_id):
    """Create a ``Protein`` for every record of an uploaded FASTA file.

    Args:
        request: the HTTP request; the uploaded file is read from
            ``request.FILES['protein_file']``.
        strain_id: primary key of the ``Strain``; an unknown key results
            in a 404 response.

    Returns:
        A redirect to ``/strains/``.

    Raises:
        ``CDS.objects.get`` is called with the record identifier, so a
        missing (or ambiguous) CDS propagates the ORM exception.
    """
    # the strain itself is not used below; the lookup is kept only to
    # respond with 404 when the strain does not exist
    get_object_or_404(Strain, pk=strain_id)

    protein_file = request.FILES['protein_file']
    for seq_record in SeqIO.parse(protein_file, "fasta"):
        # a single timestamp, so that both dates of a new record are equal
        timestamp = datetime.datetime.now()

        protein = Protein()
        protein.name = seq_record.id
        # str() is the supported way to get the sequence text;
        # Seq.tostring() is the legacy spelling
        protein.seq = str(seq_record.seq)
        protein.cds = CDS.objects.get(name=protein.name)
        protein.createdDate = timestamp
        protein.modifiedDate = timestamp
        protein.save()
    return HttpResponseRedirect('/strains/')
예제 #13
0
    def test_browse(self):
        """The browse view (following redirects) responds with 200."""
        db.session.add(Protein(**test_protein_data()))

        response = self.client.get('/protein/browse', follow_redirects=True)
        status = response.status_code

        assert status == 200
def create_test_data():
    """Prepare a mappings file together with the MAPK7 gene and its proteins.

    Returns:
        A tuple of: name of the gzipped file made from ``raw_mappings``,
        the created gene and a dict mapping refseq to the created protein.
    """
    mappings_filename = make_named_gz_file(raw_mappings)

    # create proteins from first three data rows
    protein_data = [
        (
            'NM_002749',
            'MAEPLKEEDGEDGSAEPPGPVKAEPAHTAASVAAKNLALLKARSFDVTFDVGDEYEIIETIGNGAYGVVSSARRRLTGQQVAIKKIPNAFDVVTNAKRTLRELKILKHFKHDNIIAIKDILRPTVPYGEFKSVYVVLDLMESDLHQIIHSSQPLTLEHVRYFLYQLLRGLKYMHSAQVIHRDLKPSNLLVNENCELKIGDFGMARGLCTSPAEHQYFMTEYVATRWYRAPELMLSLHEYTQAIDLWSVGCIFGEMLARRQLFPGKNYVHQLQLIMMVLGTPSPAVIQAVGAERVRAYIQSLPPRQPVPWETVYPGADRQALSLLGRMLRFEPSARISAAAALRHPFLAKYHDPDDEPDCAPPFDFAFDREALTRERIKEAIVAEIEDFHARREGIRQQIRFQPSLQPVASEPGCPDVEMPSPWAPSGDCAMESPPPAPPPCPGPAPDTIDLTLQPPPPVSEPAPPKKDGAISDNTKAALKAALLKSLRSRLRDGPSAPLEAPEPRKPVTAQERQREREEKRRRRQERAKEREKRRQERERKERGAGASGGPSTDPLAGLVLSDNDRSLLERWTRMARPAAPALTSVPAPAPAPTPTPTPVQPTSPPPGPVAQPTGPQPQSAGSTSGPVPQPACPPPGPAPHPTGPPGPIPVPAPPQIATSTSLLAAQSLVPPPGLPGSSTPGVLPYFPPGLPPPDAGGAPQSSMSESPDVNLVTQQLSKSQVEDPLPPVFSGTPKGSGAGYGVGFDLEEFLNQSFDMGVADGPQDGQADSASLSASLLADWLEGHGMNPADIESLQREIQMDSPMLLADLPDLQDP*'
        ),
        (
            'NM_139033',
            'MAEPLKEEDGEDGSAEPPGPVKAEPAHTAASVAAKNLALLKARSFDVTFDVGDEYEIIETIGNGAYGVVSSARRRLTGQQVAIKKIPNAFDVVTNAKRTLRELKILKHFKHDNIIAIKDILRPTVPYGEFKSVYVVLDLMESDLHQIIHSSQPLTLEHVRYFLYQLLRGLKYMHSAQVIHRDLKPSNLLVNENCELKIGDFGMARGLCTSPAEHQYFMTEYVATRWYRAPELMLSLHEYTQAIDLWSVGCIFGEMLARRQLFPGKNYVHQLQLIMMVLGTPSPAVIQAVGAERVRAYIQSLPPRQPVPWETVYPGADRQALSLLGRMLRFEPSARISAAAALRHPFLAKYHDPDDEPDCAPPFDFAFDREALTRERIKEAIVAEIEDFHARREGIRQQIRFQPSLQPVASEPGCPDVEMPSPWAPSGDCAMESPPPAPPPCPGPAPDTIDLTLQPPPPVSEPAPPKKDGAISDNTKAALKAALLKSLRSRLRDGPSAPLEAPEPRKPVTAQERQREREEKRRRRQERAKEREKRRQERERKERGAGASGGPSTDPLAGLVLSDNDRSLLERWTRMARPAAPALTSVPAPAPAPTPTPTPVQPTSPPPGPVAQPTGPQPQSAGSTSGPVPQPACPPPGPAPHPTGPPGPIPVPAPPQIATSTSLLAAQSLVPPPGLPGSSTPGVLPYFPPGLPPPDAGGAPQSSMSESPDVNLVTQQLSKSQVEDPLPPVFSGTPKGSGAGYGVGFDLEEFLNQSFDMGVADGPQDGQADSASLSASLLADWLEGHGMNPADIESLQREIQMDSPMLLADLPDLQDP*'
        ),
        (
            'NM_139034',
            'MAEPLKEEDGEDGSAEPPGPVKAEPAHTAASVAAKNLALLKARSFDVTFDVGDEYEIIETIGNGAYGVVSSARRRLTGQQVAIKKIPNAFDVVTNAKRTLRELKILKHFKHDNIIAIKDILRPTVPYGEFKSVYVVLDLMESDLHQIIHSSQPLTLEHVRYFLYQLLRGLKYMHSAQVIHRDLKPSNLLVNENCELKIGDFGMARGLCTSPAEHQYFMTEYVATRWYRAPELMLSLHEYTQAIDLWSVGCIFGEMLARRQLFPGKNYVHQLQLIMMVLGTPSPAVIQAVGAERVRAYIQSLPPRQPVPWETVYPGADRQALSLLGRMLRFEPSARISAAAALRHPFLAKYHDPDDEPDCAPPFDFAFDREALTRERIKEAIVAEIEDFHARREGIRQQIRFQPSLQPVASEPGCPDVEMPSPWAPSGDCAMESPPPAPPPCPGPAPDTIDLTLQPPPPVSEPAPPKKDGAISDNTKAALKAALLKSLRSRLRDGPSAPLEAPEPRKPVTAQERQREREEKRRRRQERAKEREKRRQERERKERGAGASGGPSTDPLAGLVLSDNDRSLLERWTRMARPAAPALTSVPAPAPAPTPTPTPVQPTSPPPGPVAQPTGPQPQSAGSTSGPVPQPACPPPGPAPHPTGPPGPIPVPAPPQIATSTSLLAAQSLVPPPGLPGSSTPGVLPYFPPGLPPPDAGGAPQSSMSESPDVNLVTQQLSKSQVEDPLPPVFSGTPKGSGAGYGVGFDLEEFLNQSFDMGVADGPQDGQADSASLSASLLADWLEGHGMNPADIESLQREIQMDSPMLLADLPDLQDP*'
        ),
    ]

    gene = Gene(name='MAPK7')

    proteins = {}
    for refseq_nm, sequence in protein_data:
        proteins[refseq_nm] = Protein(
            refseq=refseq_nm, sequence=sequence, gene=gene
        )

    # we need to have proteins with id in session - hence commit
    db.session.add(gene)
    db.session.commit()

    return mappings_filename, gene, proteins
    def test_prepare_dataset(self):
        """Only the MC3 dataset should report the mutation as present."""
        from views.mutation import prepare_datasets

        protein = Protein(
            refseq='NM_000123', sequence='TRAN', gene=Gene(name='TP53')
        )
        mutation = Mutation(protein=protein, position=2, alt='K')
        details = MC3Mutation(mutation=mutation, count=2)

        db.session.add(mutation)

        datasets, user_datasets = prepare_datasets(mutation)

        expected_datasets = []
        for source in source_manager.confirmed:
            if source is MC3Mutation:
                # the only source for which details were created above
                mutation_present = [details]
            else:
                mutation_present = False
            expected_datasets.append({
                'filter': 'Mutation.sources:in:' + source.name,
                'name': source.display_name,
                'mutation_present': mutation_present
            })

        assert datasets == expected_datasets
        assert not user_datasets
예제 #16
0
    def test_import(self):
        """Load O-GalNAc sites for NM_001741 and re-load them from the database."""
        sequence = (
            'MGFQKFSPFLALSILVLLQAGSLHAAPFRSALESSPADPATLSEDEARLLLAALVQNYVQMKASELEQEQEREGSSLDSPRSKRCGNLSTCMLGTYTQDFNKFHTFPQTAIGVGAPGKKRDMSSDLERDHRPHVSMPQNAN*'
        )
        db.session.add(Protein(refseq='NM_001741', sequence=sequence))

        with initialized_importer(SimpleCase, 'O-GalNAc') as importer:

            sites = importer.load_sites(site_datasets=['O-GalNAc'])

            assert len(sites) == 2

            sites_by_pos = {}
            for loaded_site in sites:
                sites_by_pos[loaded_site.position] = loaded_site

            first_site = sites_by_pos[105]
            second_site = sites_by_pos[109]
            assert first_site.residue == second_site.residue
            assert second_site.residue == 'T'
            assert sites_by_pos[105].types_names == {'O-glycosylation'}

        # check re-loading
        db.session.add_all(sites)
        db.session.commit()

        reloaded_site = Site.query.filter_by(position=105).one()
        assert reloaded_site.types_names == {'O-glycosylation'}
예제 #17
0
    def test_search_proteins(self):
        """Exercise ``search_proteins``: limit, ordering and feature subsets."""
        from views.search import search_proteins

        # create 15 genes and proteins
        mock_proteins_and_genes(15)

        # control: do we start with the mocked proteins not others?
        assert not search_proteins('TP53')

        # does respect limit? does symbol search work?
        results = search_proteins('Gene', 10)

        assert results
        assert len(results) == 10

        assert results[0].name.startswith('Gene')

        # are results sorted?
        db.session.add_all([
            Gene(
                name=name,
                # The parentheses matter: `'NM_%s' % 20 * i` formats first
                # and then repeats the string i times, which gave an empty
                # refseq for i == 0.
                preferred_isoform=Protein(refseq='NM_%s' % (20 * i))
            )
            for i, name in enumerate(['TPK', 'TPKK'])
        ])
        results = search_proteins('TPK', 2)
        assert results[0].name == 'TPK'
        assert results[0].best_score < results[1].best_score

        # does include both: refseq and symbol search?
        assert search_proteins('NM_0003', 1)

        # can we change subset of searched features?
        assert not search_proteins('NM_0003', 1, features=['gene_symbol'])
        assert not search_proteins('Gene', 1, features=['refseq'])
예제 #18
0
    def test_sites(self):
        """Check closest and affected PTM sites for mutations at 0, 5, 12 and 57."""
        mutations = [
            Mutation(position=position) for position in (0, 5, 12, 57)
        ]
        sites = [Site(position=position) for position in (10, 14, 15, 57)]

        protein = Protein(refseq='NM_00002', mutations=mutations, sites=sites)

        db.session.add(protein)
        db.session.commit()

        # ==test_find_closest_sites==

        # for mutation at position 0 there is no closest site;
        # for mutation at position 5 there should be 1 closest site
        expected_closest_counts = [0, 1, 2, 1]

        for mutation, expected_sites_cnt in zip(
            mutations, expected_closest_counts
        ):
            assert len(mutation.find_closest_sites()) == expected_sites_cnt

        # ==test_get_affected_ptm_sites==

        expected_affected_counts = [0, 1, 3, 1]

        for mutation, expected_sites_cnt in zip(
            mutations, expected_affected_counts
        ):
            assert len(mutation.get_affected_ptm_sites()) == expected_sites_cnt
예제 #19
0
    def test_types(self):
        """Filter proteins and sites by (not) containing a given site type."""
        methylation = SiteType(name='methylation')

        protein = Protein(refseq='NM_007', id=1, sequence='ABCD')
        db.session.add(protein)

        db.session.add(
            Site(position=2, types={methylation},
                 residue='B', protein=protein)
        )

        db.session.commit()

        query = Protein.query

        def proteins_having_site(criterion):
            # proteins with at least one site fulfilling the criterion
            return query.filter(Protein.sites.any(criterion))

        def sites_matching(criterion):
            return Site.query.filter(criterion)

        assert proteins_having_site(Site.types.contains(methylation)).one()
        assert not proteins_having_site(
            ~Site.types.contains(methylation)
        ).all()
        assert sites_matching(Site.types.contains(methylation)).count() == 1
        assert not sites_matching(~Site.types.contains(methylation)).all()

        # a type which is not assigned to any site
        phosphorylation = SiteType(name='phosphorylation')
        assert not proteins_having_site(
            Site.types.contains(phosphorylation)
        ).all()
        assert proteins_having_site(
            ~Site.types.contains(phosphorylation)
        ).one()
        assert sites_matching(
            Site.types.contains(phosphorylation)
        ).count() == 0
예제 #20
0
def test_prepare_tracks():
    """The first element of the first subtrack must not start below zero."""
    sites = [Site(position=4)]
    protein = Protein(refseq='NM_01', sequence='123456789', sites=sites)

    first_subtrack = SequenceTrack(protein).subtracks[0]
    site_element = first_subtrack.elements[0]

    assert site_element.start >= 0
예제 #21
0
def create_test_proteins(refseqs) -> Dict[str, Protein]:
    """Reload the proteins cache and add a new protein for every given refseq.

    Returns:
        The mapping obtained from ``get_proteins(reload_cache=True)``,
        extended with one ``Protein`` per item of ``refseqs``.
    """
    # reset cache
    cached_proteins = get_proteins(reload_cache=True)

    for accession in refseqs:
        new_protein = Protein(refseq=accession)
        cached_proteins[accession] = new_protein

    return cached_proteins
예제 #22
0
def mock_proteins_and_genes(count):
    """Add ``count`` numbered genes to the session, each with a preferred isoform.

    The i-th gene is named ``Gene_<i>`` and its protein has the refseq
    ``NM_000<i>``. Nothing is committed here.
    """
    from database import db
    from models import Gene, Protein

    for index in range(count):
        gene = Gene(
            name=f'Gene_{index}',
            full_name=f'Full name of gene {index}'
        )
        protein = Protein(refseq=f'NM_000{index}', gene=gene)
        gene.preferred_isoform = protein
        db.session.add(gene)
예제 #23
0
    def test_refseq(self):
        """Search genes by refseq: negative controls, limiting and matched isoforms."""
        from search.gene import RefseqGeneSearch

        # create 10 genes and proteins
        mock_proteins_and_genes(10)

        search = RefseqGeneSearch().search

        # negative control
        assert not search('9999')
        assert not search('NM_00000')
        assert not search('Gene')

        # limiting
        limited_results = search('NM_', limit=5)
        assert len(limited_results) == 5

        assert limited_results[0].name.startswith('Gene')

        # test the search itself
        for refseq_query in ('NM_0003', 'nm_0003', '0003'):
            found_genes = search(refseq_query)
            assert len(found_genes) == 1

            found_gene = found_genes[0]
            assert found_gene.name == 'Gene_3'

            matched = found_gene.matched_isoforms
            assert len(matched) == 1
            assert matched.pop().refseq == 'NM_0003'

        gene_x = Gene(
            name='Gene X',
            isoforms=[
                Protein(refseq='NM_000301'),
                Protein(refseq='NM_000302'),
            ]
        )
        gene_y = Gene(name='Gene Y', isoforms=[Protein(refseq='NM_000309')])
        db.session.add_all([gene_x, gene_y])

        # so there are three genes with isoforms starting with NM_0003
        # (those are Gene_3, Gene X, Gene Y). Let's see if limiting works
        # well when applied per-gene.

        # pairs of: (query, expected number of genes with limit of two)
        expected_counts = [
            ('NM_0003', 2),
            ('NM_00030', 2),
            ('NM_000301', 1),
            ('NM_000302', 1),
        ]

        for refseq_query, expected_result in expected_counts:
            assert len(search(refseq_query, limit=2)) == expected_result
예제 #24
0
    def test_domains(self):
        """Load domains from a file and verify how overlapping annotations are merged."""
        first_protein = Protein(
            refseq='NM_018163',
            sequence=(
                'MAVTKELLQMDLYALLGIEEKAADKEVKKAYRQKALSCHPDKNPDNPRAAELFHQLSQALEVLTDAAARAAYDKVRKAKKQAAERTQKLDEKRKKVKLDLEARERQAQAQESEEEEESRSTRTLEQEIERLREEGSRQLEEQQRLIREQIRQERDQRLRGKAENTEGQGTPKLKLKWKCKKEDESKGGYSKDVLLRLLQKYGEVLNLVLSSKKPGTAVVEFATVKAAELAVQNEVGLVDNPLKISWLEGQPQDAVGRSHSGLSKGSVLSERDYESLVMMRMRQAAERQQLIARMQQEDQEGPPT*'
            ),
            gene=Gene(chrom='15')
        )
        second_protein = Protein(
            refseq='NM_004671',
            sequence=(
                'MADFEELRNMVSSFRVSELQVLLGFAGRNKSGRKHDLLMRALHLLKSGCSPAVQIKIRELYRRRYPRTLEGLSDLSTIKSSVFSLDGGSSPVEPDLAVAGIHSLPSTSVTPHSPSSPVGSVLLQDTKPTFEMQQPSPPIPPVHPDVQLKNLPFYDVLDVLIKPTSLVQSSIQRFQEKFFIFALTPQQVREICISRDFLPGGRRDYTVQVQLRLCLAETSCPQEDNYPNSLCIKVNGKLFPLPGYAPPPKNGIEQKRPGRPLNITSLVRLSSAVPNQISISWASEIGKNYSMSVYLVRQLTSAMLLQRLKMKGIRNPDHSRALIKEKLTADPDSEIATTSLRVSLMCPLGKMRLTIPCRAVTCTHLQCFDAALYLQMNEKKPTWICPVCDKKAAYESLILDGLFMEILNDCSDVDEIKFQEDGSWCPMRPKKEAMKVSSQPCTKIESSSVLSKPCSVTVASEASKKKVDVIDLTIESSSDEEEDPPAKRKCIFMSETQSSPTKGVLMYQPSSVRVPSVTSVDPAAIPPSLTDYSVPFHHTPISSMSSDLPGLDFLSLIPVDPQYCPPMFLDSLTSPLTASSTSVTTTSSHESSTHVSSSSSRSETGVITSSGSNIPDIISLD*'
            ),
            gene=Gene(chrom='18')
        )
        proteins = [first_protein, second_protein]

        db.session.add_all(proteins)

        filename = make_named_temp_file(domains_data)

        new_domains = load_domains(filename)

        assert len(new_domains) == 6
        assert len(proteins[0].domains) == 2

        # domains of the second protein, grouped by interpro short description
        domains = defaultdict(list)
        for domain in proteins[1].domains:
            domains[domain.interpro.short_description].append(domain)

        # two SAP domains should be merged for representation purposes due to similarity criteria
        # (these two domain annotation overlap so the smaller one is contained in the bigger)
        sap_domain = domains['SAP_dom'][0]
        assert sap_domain.start == 1 and sap_domain.end == 65

        interpro_domain = sap_domain.interpro
        assert interpro_domain.accession == 'IPR003034'
        assert interpro_domain.description == 'SAP domain'

        # here the two annotations overlap with more than 75% of common
        pinit_domain = domains['PINIT'][0]
        assert pinit_domain.start == 141 and pinit_domain.end == 299

        # and here overlap was too small to merge those domains
        assert len(domains['Znf_MIZ']) == 2
예제 #25
0
def make_test_shared_proteins():
    """Add two proteins (NM_004318 and NM_020164) to the session and commit."""
    protein_data = [
        (
            'NM_004318',
            'MAQRKNAKSSGNSSSSGSGSGSTSAGSSSPGARRETKHGGHKNGRKGGLSGTSFFTWFMVIALLGVWTSVAVVWFDLVDYEEVLGKLGIYDADGDGDFDVDDAKVLLGLKERSTSEPAVPPEEAEPHTEPEEQVPVEAEPQNIEDEAKEQIQSLLHEMVHAEHVEGEDLQQEDGPTGEPQQEDDEFLMATDVDDRFETLEPEVSHEETEHSYHVEETVSQDCNQDMEEMMSEQENPDSSEPVVEDERLHHDTDDVTYQVYEEQAVYEPLENEGIEITEVTAPPEDNPVEDSQVIVEEVSIFPVEEQQEVPPETNRKTDDPEQKAKVKKKKPKLLNKFDKTIKAELDAAEKLRKRGKIEEAVNAFKELVRKYPQSPRARYGKAQCEDDLAEKRRSNEVLRGAIETYQEVASLPDVPADLLKLSLKRRSDRQQFLGHMRGSLLTLQRLVQLFPNDTSLKNDLGVGYLLIGDNDNAKKVYEEVLSVTPNDGFAKVHYGFILKAQNKIAESIPYLKEGIESGDPGTDDGRFYFHLGDAMQRVGNKEAYKWYELGHKRGHFASVWQRSLYNVNGLKAQPWWTPKETGYTELVKSLERNWKLIRDEGLAVMDKAKGLFLPEDENLREKGDWSQFTLWQQGRRNENACKGAPKTCTLLEKFPETTGCRRGQIKYSIMHPGTHVWPHTGPTNCRLRMHLGLVIPKEGCKIRCANETKTWEEGKVLIFDDSFEHEVWQDASSFRLIFIVDVWHPELTPQQRRSLPAI*'
        ),
        (
            'NM_020164',
            'MAEDKETKHGGHKNGRKGGLSGTSFFTWFMVIALLGVWTSVAVVWFDLVDYEEVLAKAKDFRYNLSEVLQGKLGIYDADGDGDFDVDDAKVLLEGPSGVAKRKTKAKVKELTKEELKKEKEKPESRKESKNEERKKGKKEDVRKDKKIADADLSRKESPKGKKDREKEKVDLEKSAKTKENRKKSTNMKDVSSKMASRDKDDRKESRSSTRYAHLTKGNTQKRNG*'
        ),
    ]

    proteins = []
    for refseq, sequence in protein_data:
        proteins.append(Protein(refseq=refseq, sequence=sequence))

    db.session.add_all(proteins)

    db.session.commit()
예제 #26
0
def create_test_kinase(name, refseq):
    """Create a kinase linked to a new protein, which belongs to a new gene.

    Args:
        name: name of the kinase; the gene is named ``'Gene of ' + <kinase name>``.
        refseq: refseq of the protein of the kinase.

    Returns:
        The created ``Kinase``.
    """
    kinase = Kinase(name=name)

    gene_name = 'Gene of ' + kinase.name
    protein = Protein(refseq=refseq, gene=Gene(name=gene_name))

    kinase.protein = protein

    return kinase
예제 #27
0
    def test_search_mutations(self):
        """Search mutations by a textual query and by an uploaded VCF file."""
        methylation_site = Site(
            position=13, types={SiteType(name='methylation')}
        )
        protein = Protein(
            refseq='NM_007', id=7,
            sites=[methylation_site], sequence='XXXXXXXXXXXXV'
        )

        m_in_site = Mutation(protein=protein, position=13, alt='V')
        m_out_site = Mutation(protein=protein, position=50, alt='K')

        db.session.add(protein)

        # points to the same location as first record in VCF_FILE_CONTENT
        test_query = 'chr20 14370 G A'

        from database import bdb

        # map the first genomic mutation from VCF_FILE_CONTENT
        # to some (mocked) protein mutation
        bdb.add_genomic_mut('20', 14370, 'G', 'A', m_in_site, is_ptm=True)

        def table_cell(content):
            # html of a table cell, encoded like the response data
            return f'<td>{content}</td>'.encode()

        #
        # basic test - is appropriate mutation in results?
        #
        response = self.search_mutations(mutations=test_query)

        assert response.status_code == 200

        # this mutation is exactly at a PTM site and should be included in results
        assert table_cell(m_in_site.alt) in response.data
        # this mutation lies outside of a PTM site - by default should be filtered out
        assert table_cell(m_out_site.alt) not in response.data

        #
        # count test - is mutation for this query annotated as shown twice?
        #
        doubled_query = f'{test_query}\n{test_query}'
        response = self.search_mutations(mutations=doubled_query)

        assert response.status_code == 200
        assert b'<td>2</td>' in response.data

        #
        # VCF file test
        #
        vcf_upload = (BytesIO(VCF_FILE_CONTENT), 'exemplar_vcf.vcf')
        response = self.client.post(
            '/search/mutations',
            content_type='multipart/form-data',
            data={'vcf-file': vcf_upload}
        )

        assert response.status_code == 200
        assert b'NM_007' in response.data
예제 #28
0
def test_trim_ends():
    """``trim_ends`` modifies the given elements in place."""
    # protein sequences are 1-based
    track = SequenceTrack(Protein(sequence='1234567890'))

    # pairs of: (element before trimming, the same element after trimming)
    cases = [
        ([-5, 10], [1, 4]),
        # 567890---- (should include 0)
        ([5, 10], [5, 6]),
    ]

    for element, expected in cases:
        track.trim_ends([element])
        assert element == expected
예제 #29
0
    def test_redirect(self):
        """The protein view redirects to the sequence view and keeps the filters."""
        db.session.add(Protein(**test_protein_data()))

        query_string = '?filters=Mutation.sources:in:MC3'

        response = self.client.get('/protein/show/NM_000123' + query_string)
        assert response.status_code == 302

        expected_location = '/sequence/show/NM_000123' + query_string
        assert relative_location(response) == expected_location
예제 #30
0
    def test_map_site_to_isoform(self):
        """Map raw sites onto a protein, including an ambiguous (warned about) match."""

        def site_id(s):
            return f'{s.position}{s.sequence}'

        mapper = SiteMapper([], site_id)
        protein = Protein(sequence='LKIQYTKIFINNEWHDSVSG')

        unique_site = RawSite(
            sequence='FIN', position=6, left_sequence_offset=1
        )
        assert mapper.map_site_to_isoform(unique_site, protein) == [10]

        # 'KI' occurs twice in the sequence, which should trigger a warning
        with warns(UserWarning, match='More than one match for: 2KI'):
            ambiguous_site = RawSite(
                sequence='KI', position=2, left_sequence_offset=0
            )
            positions = mapper.map_site_to_isoform(ambiguous_site, protein)
            assert positions == [2, 7]
예제 #31
0
# Default location of the CSV export imported by ``loadProtein``.
_PROTEIN_CSV_PATH = (
    'C:/Users/anna/Desktop/Doktorat/typeii/src/typeii/sourceData/'
    'nonsystemalso_new.csv'
)

# Number of leading lines of the CSV file which are headers, not records.
_PROTEIN_CSV_HEADER_LINES = 2


def _unquote(field):
    """Return ``field`` with all double-quote characters removed."""
    return field.replace('"', '')


def loadProtein(request, source_path=_PROTEIN_CSV_PATH):
    """Import proteins from a CSV file and save one ``Protein`` per gene id.

    The first two lines of the file are skipped as headers. Every following
    line is split on commas; a line whose gene id (first column) was already
    seen is ignored. A protein is saved only when the system id of the line
    (fifth column) matches the id of an existing ``System``.

    Args:
        request: the HTTP request (not used).
        source_path: path of the CSV file to import; defaults to the
            location used so far.

    Returns:
        An ``HttpResponse`` with a short confirmation text.

    Raises:
        IndexError, ValueError or ``decimal.InvalidOperation`` when a line
        has too few columns or a numeric column cannot be parsed.
    """
    # NOTE(review): lines are split on bare commas, so a quoted field with
    # a comma inside would shift the columns - confirm the data has none.
    systems_by_id = {record.id: record for record in System.objects.all()}
    seen_gene_ids = set()
    two_places = decimal.Decimal('.01')

    # the context manager closes the file also when a line fails to parse
    with open(source_path) as source_file:
        for line_number, line in enumerate(source_file):
            if line_number < _PROTEIN_CSV_HEADER_LINES:
                continue

            data = line.split(',')
            gene_id = _unquote(data[0])
            # Duplicates are skipped on their own, independently of header
            # skipping: sharing one counter between the two made every line
            # following the first duplicate be skipped as well.
            if gene_id in seen_gene_ids:
                continue

            system = int(_unquote(data[4]))
            genome_location = _unquote(data[8])
            strand = '+' if _unquote(data[9]) == '+' else '-'
            margin_left = int(_unquote(data[10]))
            system_part = _unquote(data[11])
            clans_cluster = ''  # fixed value till data available
            hammer_cluster = _unquote(data[12])
            subunit_kind = _unquote(data[13])
            dna_length = int(_unquote(data[15]))
            aa_sequence = _unquote(data[17])
            hh_pfam_id = _unquote(data[18])
            hh_pfam_short_desc = _unquote(data[19])
            hh_probability_raw = _unquote(data[20])
            # records without a hh value default to zero; Decimal is
            # immutable, so the result of quantize() has to be kept
            hh_probability = decimal.Decimal(
                hh_probability_raw if hh_probability_raw != '' else '0'
            ).quantize(two_places)
            hh_e_value = _unquote(data[21])
            hh_pfam_desc = _unquote(data[22])
            m_probability = 0  # fixed value till data from Vilno
            r_probability = 0  # fixed value till data from Vilno
            s_probability = 0  # fixed value till data from Vilno

            print(gene_id)
            seen_gene_ids.add(gene_id)

            record = systems_by_id.get(system)
            if record is None:
                # no such system - nothing to attach the protein to
                continue

            # verbose upload
            print(record.id, gene_id + ' ' + genome_location + ' ' +
                  strand, margin_left, system_part + ' ' +
                  clans_cluster + ' ' + hammer_cluster + ' ' +
                  subunit_kind, dna_length, aa_sequence + ' ' +
                  hh_pfam_id + ' ' + hh_pfam_short_desc,
                  hh_probability, hh_e_value + ' ' +
                  hh_pfam_desc, m_probability,
                  r_probability, s_probability)
            p = Protein(system=record, gene_id=gene_id, genome_location=genome_location,
                        strand=strand, margin_left=margin_left, system_part=system_part,
                        clans_cluster=clans_cluster, hammer_cluster=hammer_cluster,
                        subunit_kind=subunit_kind, dna_length=dna_length, aa_sequence=aa_sequence,
                        hh_pfam_id=hh_pfam_id, hh_pfam_short_desc=hh_pfam_short_desc,
                        hh_probability=hh_probability, hh_e_value=hh_e_value,
                        hh_pfam_desc=hh_pfam_desc, m_probability=m_probability,
                        r_probability=r_probability, s_probability=s_probability)
            p.save()
            print('Protein appended.')
    return HttpResponse('Download complete.')