예제 #1
0
    def test_divide_muts_by_sites(self):
        """Check how `divide_muts_by_sites` groups mutations under sites.

        The expectations below assign a mutation to every site it is listed
        for, so a single mutation may appear under more than one site.
        """
        from views.network import divide_muts_by_sites

        # degenerate call: no mutations and no sites must simply not raise
        divide_muts_by_sites([], [])

        # a lone site without any mutations maps to an empty list
        first_site = Site(position=1)
        grouped = divide_muts_by_sites([], [first_site])
        assert grouped[first_site] == []

        # full scenario: three sites with mutations scattered around them
        second_site = Site(position=10)
        third_site = Site(position=20)

        muts_by_pos = {}
        for pos in (1, 2, 8, 14, 16, 30):
            muts_by_pos[pos] = [Mutation(position=pos)]

        # position 16 carries two separate mutations
        muts_by_pos[16].append(Mutation(position=16))

        def get_muts_from_pos(*positions):
            """Flatten the mutations stored under the given positions."""
            collected = []
            for position in positions:
                collected.extend(muts_by_pos[position])
            return collected

        all_mutations = []
        for muts_on_pos_x in muts_by_pos.values():
            all_mutations.extend(muts_on_pos_x)

        grouped = divide_muts_by_sites(
            all_mutations,
            [first_site, second_site, third_site]
        )

        assert grouped[first_site] == get_muts_from_pos(1, 2, 8)
        assert grouped[second_site] == get_muts_from_pos(8, 14, 16)
        assert grouped[third_site] == get_muts_from_pos(14, 16)
예제 #2
0
    def get_filter_by_sources(sources):
        """Build a criterion requiring mutation details in each given source.

        For every source the relationship returned by
        `Mutation.get_relationship(source)` is tested with `.any()` when
        `are_details_managed(source)` is truthy and with `.has()` otherwise.
        The per-source criteria are then combined with `and_`.

        Args:
            sources: iterable of mutation sources.

        Returns:
            The conjunction of all per-source criteria.
        """
        criteria = []

        for source in sources:
            relationship = Mutation.get_relationship(source)
            if are_details_managed(source):
                criteria.append(relationship.any())
            else:
                criteria.append(relationship.has())

        # pass the clauses as explicit positional arguments rather than
        # handing a single generator object over to and_()
        return and_(*criteria)
예제 #3
0
def create_network():
    """Populate the database with a small protein interaction network.

    Creates a test protein with two phosphorylation sites, a kinase assigned
    to the first site ('Kinase Y'), a kinase referenced only by a MIMP
    prediction ('Kinase Z'), a kinase group, an approved drug targeting the
    gene of 'Kinase Y' and MC3 mutations in an 'Ovarian' cancer. Everything
    is committed and the cached filter queries are reloaded afterwards.
    """
    protein = create_test_protein()
    ovarian = Cancer(name='Ovarian', code='OV')

    # the kinase assigned directly to a site; it has a mutation of its own
    known_kinase = create_test_kinase('Kinase Y', 'NM_0009')
    known_kinase.protein.mutations = [
        Mutation(
            position=1,
            alt='T',
            meta_MC3=[MC3Mutation(cancer=ovarian)]
        )
    ]

    # by default only approved drugs are shown
    approved = DrugGroup(name='approved')
    drug = Drug(
        name='Drug targeting ' + known_kinase.name,
        drug_bank_id='DB01',
        target_genes=[known_kinase.protein.gene],
        groups={approved}
    )

    kinase_group = KinaseGroup(name='Group of kinases')
    site_with_kinase = Site(
        position=1,
        type='phosphorylation',
        residue='T',
        kinases=[known_kinase],
        kinase_groups=[kinase_group]
    )
    site_with_group_only = Site(
        position=2,
        type='phosphorylation',
        residue='R',
        kinase_groups=[kinase_group]
    )
    protein.sites = [site_with_kinase, site_with_group_only]

    predicted_kinase = create_test_kinase('Kinase Z', 'NM_0002')

    # one MIMP prediction per kinase, both pointing at the first site
    mimp_predictions = [
        MIMPMutation(
            pwm=known_kinase.name,
            effect='loss',
            site=site_with_kinase,
            probability=0.1,
            position_in_motif=1
        ),
        MIMPMutation(
            pwm=predicted_kinase.name,
            effect='gain',
            site=site_with_kinase,
            probability=0.1,
            position_in_motif=1
        )
    ]
    protein.mutations = [
        Mutation(
            position=2,
            alt='T',
            meta_MC3=[MC3Mutation(cancer=ovarian)],
            meta_MIMP=mimp_predictions
        )
    ]

    db.session.add_all([protein, drug, predicted_kinase])
    db.session.commit()

    # a new cancer was added, so the cached queries have to be reloaded
    # (this should not happen during normal app usage)
    from website.views.filters import cached_queries
    cached_queries.reload()
예제 #4
0
    def test_search_mutations(self):
        """Search mutations by a textual query and by an uploaded VCF file."""

        methylation_site = Site(
            position=13,
            types={SiteType(name='methylation')}
        )
        protein = Protein(
            refseq='NM_007',
            id=7,
            sites=[methylation_site],
            sequence='XXXXXXXXXXXXV'
        )

        m_in_site = Mutation(protein=protein, position=13, alt='V')
        m_out_site = Mutation(protein=protein, position=50, alt='K')

        db.session.add(protein)

        # points to the same location as the first record in VCF_FILE_CONTENT
        test_query = 'chr20 14370 G A'

        from database import bdb

        # map the first genomic mutation from VCF_FILE_CONTENT
        # to a (mocked) protein mutation
        bdb.add_genomic_mut('20', 14370, 'G', 'A', m_in_site, is_ptm=True)

        # -- basic test: is the appropriate mutation in the results? --
        response = self.search_mutations(mutations=test_query)
        assert response.status_code == 200

        # the mutation lying exactly at a PTM site should be listed
        in_site_cell = f'<td>{m_in_site.alt}</td>'.encode()
        assert in_site_cell in response.data

        # the mutation outside of any PTM site is filtered out by default
        out_site_cell = f'<td>{m_out_site.alt}</td>'.encode()
        assert out_site_cell not in response.data

        # -- count test: is a repeated query annotated as seen twice? --
        doubled_query = '\n'.join([test_query, test_query])
        response = self.search_mutations(mutations=doubled_query)

        assert response.status_code == 200
        assert b'<td>2</td>' in response.data

        # -- VCF file test --
        vcf_upload = (BytesIO(VCF_FILE_CONTENT), 'exemplar_vcf.vcf')
        response = self.client.post(
            '/search/mutations',
            content_type='multipart/form-data',
            data={'vcf-file': vcf_upload}
        )

        assert response.status_code == 200
        assert b'NM_007' in response.data
    def test_prepare_dataset(self):
        """`prepare_datasets` should describe every confirmed source.

        Only the MC3 entry is expected to carry the mutation details; all
        other sources are expected to be reported with no mutation present.
        """
        from views.mutation import prepare_datasets

        protein = Protein(
            refseq='NM_000123',
            sequence='TRAN',
            gene=Gene(name='TP53')
        )
        mutation = Mutation(protein=protein, position=2, alt='K')
        details = MC3Mutation(mutation=mutation, count=2)

        db.session.add(mutation)

        datasets, user_datasets = prepare_datasets(mutation)

        expected_datasets = []
        for source in source_manager.confirmed:
            if source is MC3Mutation:
                entry = {
                    'filter': 'Mutation.sources:in:' + MC3Mutation.name,
                    'name': MC3Mutation.display_name,
                    'mutation_present': [details]
                }
            else:
                entry = {
                    'filter': 'Mutation.sources:in:' + source.name,
                    'name': source.display_name,
                    'mutation_present': False
                }
            expected_datasets.append(entry)

        assert datasets == expected_datasets
        assert not user_datasets
예제 #6
0
    def test_sites(self):
        """Verify the site lookups offered by `Mutation`.

        Covers both `find_closest_sites` and `get_affected_ptm_sites` for
        mutations placed at various distances from the protein's sites.
        """
        mutation_positions = (0, 5, 12, 57)
        site_positions = (10, 14, 15, 57)

        mutations = [Mutation(position=x) for x in mutation_positions]
        sites = [Site(position=x) for x in site_positions]

        protein = Protein(refseq='NM_00002', mutations=mutations, sites=sites)

        db.session.add(protein)
        db.session.commit()

        # ==test_find_closest_sites==

        # for the mutation at position 0 there is no closest site;
        # for the mutation at position 5 there should be 1 closest site
        expected_closest_counts = [0, 1, 2, 1]

        for mutation, expected_sites_cnt in zip(mutations, expected_closest_counts):
            assert len(mutation.find_closest_sites()) == expected_sites_cnt

        # ==test_get_affected_ptm_sites==

        expected_affected_counts = [0, 1, 3, 1]

        for mutation, expected_sites_cnt in zip(mutations, expected_affected_counts):
            assert len(mutation.get_affected_ptm_sites()) == expected_sites_cnt
예제 #7
0
def count_by_sources(sources: List[MutationSource],
                     site_type: SiteType,
                     primary_isoforms=True,
                     by_genes=True,
                     genes=None,
                     muts_conjunction=or_,
                     **kwargs):
    """Count mutations and sites of a given type with `MotifsCounter`.

    Args:
        sources: mutation sources; the per-source `Mutation.in_sources`
            criteria are combined with `muts_conjunction`
        site_type: only sites whose types contain this type are considered
        primary_isoforms: restrict mutations to preferred isoforms
        by_genes: return per-gene counts instead of a single result
        genes: genes to iterate over; when empty or None, all genes are used
        muts_conjunction: callable combining the per-source criteria
        **kwargs: forwarded to `MotifsCounter.count_muts_and_sites`

    Returns:
        The result of `count_muts_and_sites` when `by_genes` is false,
        otherwise a dict mapping gene names onto such results.
    """
    source_criteria = [Mutation.in_sources(source) for source in sources]
    mutations = Mutation.query.filter(muts_conjunction(*source_criteria))

    if primary_isoforms:
        mutations = (
            mutations
            .join(Protein)
            .filter(Protein.is_preferred_isoform)
        )

    sites = Site.query.filter(Site.types.contains(site_type))
    counter = MotifsCounter(site_type)

    if not by_genes:
        return counter.count_muts_and_sites(mutations, sites, **kwargs)

    if not genes:
        genes = Gene.query.all()

    counts_by_genes = {}

    for gene in tqdm(genes):
        isoform = gene.preferred_isoform
        counts_by_genes[gene.name] = counter.count_muts_and_sites(
            mutations.filter(Mutation.protein == isoform),
            sites.filter(Site.protein == isoform),
            show_progress=False,
            **kwargs
        )

    return counts_by_genes
예제 #8
0
def create_test_models():
    """Build a set of interlinked test models.

    Creates a protein (being the preferred isoform of its gene) with one
    mutation having MC3 and inherited (clinical) details, and a single
    glycosylation site with an assigned kinase.

    Returns:
        A dict with the created objects under the keys: 'protein',
        'mutation', 'protein_kinase', 'kinase' and 'site'.
    """
    protein = Protein(
        refseq='NM_0001',
        gene=Gene(name='SOMEGENE'),
        sequence='ABCD'
    )
    mutation = Mutation(protein=protein, position=1, alt='E')
    protein.gene.preferred_isoform = protein

    # the details attach themselves to the mutation; no handle is needed
    MC3Mutation(
        mutation=mutation,
        cancer=Cancer(code='CAN'),
        samples='Sample A,Sample B',
        count=2
    )
    clinical_records = [
        ClinicalData(disease=Disease(name='Some disease'), sig_code=5),
        ClinicalData(disease=Disease(name='Other disease'), sig_code=2)
    ]
    InheritedMutation(mutation=mutation, clin_data=clinical_records)

    protein_kinase = Protein(
        refseq='NM_0002',
        gene=Gene(name='OTHERGENE'),
        sequence='ABCD'
    )
    kinase = Kinase(name='Kinase name', protein=protein_kinase)
    site = Site(
        protein=protein,
        position=1,
        residue='A',
        kinases={kinase},
        pmid={1, 2},
        types={SiteType(name='glycosylation')}
    )
    protein.sites = [site]

    # spelled out explicitly; the same mapping that locals() would give here
    return {
        'protein': protein,
        'mutation': mutation,
        'protein_kinase': protein_kinase,
        'kinase': kinase,
        'site': site
    }
예제 #9
0
def mutation_sources():
    """Map source names onto source models, leaving out the 'user' source."""
    return {
        name: Mutation.get_source_model(name)
        for name, _ in Mutation.sources_dict.items()
        if name != 'user'
    }
예제 #10
0
def source_specific_mutated_sites():
    """Compute per-source statistics of mutations related to PTM sites.

    Returns:
        A dict with three entries: counts of confirmed mutations flagged
        as `is_ptm_distal` per source, counts of confirmed mutations that
        also have MIMP details per source, and counts of mutated sites per
        source and site type (with an additional 'merged' entry computed
        without restricting to any source).
    """
    # a site type with an empty name is meant to match all the sites
    site_type_queries = [models.SiteType(name='')]
    site_type_queries.extend(models.SiteType.query)

    muts_in_ptm_sites = {}
    mimp_muts = {}
    mutated_sites = defaultdict(dict)

    for name, model in mutation_sources().items():
        confirmed_in_ptm = Mutation.query.filter_by(
            is_confirmed=True, is_ptm_distal=True
        )
        muts_in_ptm_sites[name] = (
            confirmed_in_ptm
            .filter(Mutation.in_sources(model))
            .count()
        )

        confirmed_with_mimp = and_(
            Mutation.in_sources(models.MIMPMutation, model),
            Mutation.is_confirmed,
        )
        mimp_muts[name] = Mutation.query.filter(confirmed_with_mimp).count()

        for site_type in tqdm(site_type_queries):
            mutated_sites[name][site_type] = count_mutated_sites(
                [site_type], model
            )

    # the same counts, but not limited to any particular source
    mutated_sites['merged'] = {
        site_type: count_mutated_sites([site_type])
        for site_type in tqdm(site_type_queries)
    }

    return {
        'Mutations - in PTM sites': muts_in_ptm_sites,
        'Mutations - with network-rewiring effect': mimp_muts,
        'PTM sites affected by mutations': mutated_sites
    }
예제 #11
0
 def with_significant_genes(self, significant_gene_list_name):
     """Render the page of pathways for the gene list with the given name.

     Responds with 404 when no gene list has such a name; the optional
     'query' request argument is passed through to the template.
     """
     gene_list = (
         GeneList.query
         .filter_by(name=significant_gene_list_name)
         .first_or_404()
     )
     context = {
         'gene_list': gene_list,
         'dataset': Mutation.get_source_model(gene_list.mutation_source_name),
         'endpoint': 'significant_data',
         'endpoint_kwargs': {'gene_list_id': gene_list.id},
         'query': request.args.get('query', ''),
     }
     return template('pathway/significant.html', **context)
예제 #12
0
def get_genes_with_mutations_from_sources(sources,
                                          only_genes_with_ptm_sites=False):
    """Collect genes whose preferred isoform has mutations in given sources.

    Args:
        sources: mutation sources passed to `Mutation.in_sources`
        only_genes_with_ptm_sites: keep only the genes whose preferred
            isoform has at least one site

    Returns:
        A set of genes.
    """
    genes_query = (
        db.session.query(Gene)
        .join(Protein, Gene.preferred_isoform_id == Protein.id)
        .join(Mutation)
        .filter(Mutation.in_sources(*sources))
        .distinct()
    )
    genes = set(genes_query)

    if not only_genes_with_ptm_sites:
        return genes

    return {gene for gene in genes if gene.preferred_isoform.sites}
예제 #13
0
def mutated_ptm_sites_in_proximity(mutation_source,
                                   type_1: str,
                                   type_2: str,
                                   mutation_filter=True,
                                   distance: int = 7,
                                   only_preferred=True):
    """Select pairs of nearby PTM sites where both sites are mutated.

    Starts from the query returned by `ptm_sites_in_proximity` and requires
    each of the two sites to be affected by at least one mutation that is
    present in `mutation_source` and satisfies `mutation_filter`.

    Args:
        mutation_source: source passed to `Mutation.in_sources`
        type_1: forwarded to `ptm_sites_in_proximity`
        type_2: forwarded to `ptm_sites_in_proximity`
        mutation_filter: additional criterion AND-ed with the source
            criterion; the default `True` adds no restriction
        distance: forwarded to `ptm_sites_in_proximity`
        only_preferred: forwarded to `ptm_sites_in_proximity`

    Returns:
        The filtered query object (not a number; the former `-> int`
        annotation was incorrect).
    """
    sites, (site_1, site_2) = ptm_sites_in_proximity(type_1, type_2, distance,
                                                     only_preferred)
    for site in (site_1, site_2):
        # each site of the pair needs a matching mutation of its own
        sites = sites.filter(
            site.affected_by_mutations.any(
                and_(Mutation.in_sources(mutation_source), mutation_filter)))
    return sites
예제 #14
0
def most_mutated_sites(sources: List[MutationSource],
                       site_type: SiteType = None,
                       limit=25,
                       intersection=True,
                       exclusive=None,
                       mutation_filter=None):
    """Query the sites of preferred isoforms having the most mutations.

    Sources must have the same value_type (counts/frequencies).

    Args:
        sources: mutation sources whose counts are summed up
        site_type: when given, limits the sites with a fuzzy type filter
        limit: the maximal number of rows to return
        intersection: inner-join the sources when true, otherwise
            outer-join them; cannot be combined with `exclusive`
        exclusive: sources which the mutations must not belong to
            (only used when `intersection` is false)
        mutation_filter: an optional additional criterion

    Returns:
        A query yielding (site, mutations_count) rows, ordered by
        the descending count and limited to `limit` rows.
    """

    assert not (intersection and exclusive)

    counts = prepare_for_summing(sources)
    labelled_counts = [
        count.label(f'count_{i}')
        for i, count in enumerate(counts)
    ]

    per_site = db.session.query(Site, *labelled_counts).select_from(Mutation)

    for source in sources:
        if intersection:
            per_site = per_site.join(source)
        else:
            per_site = per_site.outerjoin(source)

    if not intersection and exclusive:
        per_site = per_site.filter(~Mutation.in_sources(*exclusive))

    if mutation_filter is not None:
        per_site = per_site.filter(mutation_filter)

    per_site = per_site.join(Mutation.affected_sites)
    per_site = per_site.filter(Site.protein.has(Protein.is_preferred_isoform))

    if site_type:
        per_site = per_site.filter(SiteType.fuzzy_filter(site_type, join=True))

    per_site = per_site.group_by(Site).having(and_(*counts))
    per_site = per_site.subquery()

    # add up the per-source count columns of the subquery
    count_columns = [
        getattr(per_site.c, f'count_{i}')
        for i in range(len(counts))
    ]
    total_muts_count = reduce(operator.add, count_columns)
    total_muts_count = total_muts_count.label('mutations_count')

    ranked = db.session.query(aliased(Site, per_site), total_muts_count)
    ranked = ranked.order_by(desc(total_muts_count))

    return ranked.limit(limit)
예제 #15
0
    def test_mutate(self):
        """`mutate_sequence` should substitute the residue at a mutation."""

        protein = Protein(sequence='ABCDE')
        site = Site(protein=protein, position=3, residue='C')

        # (position of the mutation, expected sequence)
        cases = (
            (1, 'XBCDE'),
            (3, 'ABXDE'),
        )

        for position, expected_seq in cases:
            mutation = Mutation(protein=protein, position=position, alt='X')
            mutated = mutate_sequence(site, mutation, offset=2)

            assert mutated == expected_seq
예제 #16
0
    def test_show(self):
        """The mutation page should mention both the gene and the refseq."""

        gene = Gene(name='TP53')
        protein = Protein(refseq='NM_000123', sequence='TRAN', gene=gene)
        mutation = Mutation(protein=protein, position=2, alt='K')

        db.session.add(mutation)

        response = self.client.get('/mutation/show/NM_000123/2/K')

        assert response.status_code == 200
        for expected_fragment in (b'TP53', b'NM_000123'):
            assert expected_fragment in response.data
예제 #17
0
    def test_impact_on_ptm(self):
        """A mutation placed exactly at a site has the 'direct' impact."""

        mutation = Mutation(position=61)
        protein = Protein(refseq='NM_00001', mutations=[mutation])
        db.session.add(protein)

        # one site at the position of the mutation and two other sites
        protein.sites = [Site(position=pos) for pos in (61, 54, 51)]

        assert mutation.impact_on_ptm() == 'direct'
예제 #18
0
def get_confirmed_mutations(sources,
                            only_preferred=True,
                            genes=None,
                            confirmed_by_definition=False,
                            base_query=None):
    """
    Utility to generate a query for retrieving confirmed mutations having specific mutation details.

    Args:
        sources: list of mutation details (sources) to be used to filter
            the mutations (including sources with non-confirmed mutations)
        only_preferred: include only mutations from preferred isoforms;
            note that the restriction to preferred isoforms is currently
            applied regardless of this flag (see the TODO below)
        genes: limit to genes from provided list
        confirmed_by_definition: do not apply the expensive is_confirmed=True
            filter as all sources include only confirmed mutations
        base_query: the initial mutation query (allows to adjust selected columns)

    Returns:
        Query object yielding mutations.
    """

    def only_from_primary_isoforms(mutations_query):
        """Join with Protein (once) and keep preferred isoforms only."""
        joined = join_unique(mutations_query, Protein)
        return joined.filter(Protein.is_preferred_isoform)

    if not base_query:
        base_query = Mutation.query

    selected_mutations = base_query

    if not confirmed_by_definition:
        selected_mutations = selected_mutations.filter_by(is_confirmed=True)

    # TODO: remove?
    selected_mutations = only_from_primary_isoforms(selected_mutations)

    if genes:
        preferred_isoform_ids = [g.preferred_isoform_id for g in genes]
        selected_mutations = selected_mutations.filter(
            Protein.id.in_(preferred_isoform_ids)
        )

    selected_mutations = selected_mutations.filter(
        Mutation.in_sources(*sources)
    )

    if only_preferred:
        selected_mutations = only_from_primary_isoforms(selected_mutations)

    return selected_mutations
예제 #19
0
def count_mutated_sites(
    site_types: Iterable[models.SiteType]=tuple(), model=None,
    only_primary=False, disordered=None, custom_filter=None
):
    """Count distinct sites having a PTM mutation within 7 residues.

    Args:
        site_types: site types; a fuzzy filter is added for each of them
        model: mutation source to restrict to; when not given,
            confirmed mutations are counted instead
        only_primary: restrict to the preferred isoforms
        disordered: when not None, required value of
            `Site.in_disordered_region`
        custom_filter: an optional additional criterion

    Returns:
        The scalar result of the counting query.
    """
    filters = [
        Mutation.protein_id == Protein.id,
        Site.protein_id == Protein.id,
        Mutation.precomputed_is_ptm
    ]
    filters.extend(
        models.SiteType.fuzzy_filter(site_type)
        for site_type in site_types
    )
    if custom_filter is not None:
        filters.append(custom_filter)
    if disordered is not None:
        filters.append(Site.in_disordered_region == disordered)

    # yields the site id only when the mutation lies within the +/- 7
    # residues window around the site, and NULL otherwise
    mutation_close_to_site = Mutation.position.between(
        Site.position - 7,
        Site.position + 7
    )
    affected_site_id = case(
        [(mutation_close_to_site, Site.id)],
        else_=literal_column('NULL')
    )

    query = (
        db.session.query(func.count(distinct(affected_site_id)))
        .filter(and_(*filters))
        .join(Mutation, Site.protein_id == Mutation.protein_id)
    )

    if model:
        query = query.filter(Mutation.in_sources(model))
    else:
        query = query.filter(Mutation.is_confirmed == True)

    if only_primary:
        query = query.join(Protein).filter(Protein.is_preferred_isoform)

    return query.scalar()
예제 #20
0
def mutation_by_source(combination,
                       site_type=None,
                       only_within_ptm_sites=False,
                       only_primary=False):
    """Count mutations that belong to the given combination of sources.

    Args:
        combination: sources passed together to `Mutation.in_sources`
        site_type: when given, require an affected site of this type
        only_within_ptm_sites: require `Mutation.precomputed_is_ptm`
        only_primary: restrict to the preferred isoforms

    Returns:
        The number of matching mutations.
    """
    mutations = Mutation.query.filter(Mutation.in_sources(*combination))

    if only_within_ptm_sites:
        mutations = mutations.filter(Mutation.precomputed_is_ptm)

    if site_type:
        has_site_of_type = Mutation.affected_sites.any(
            Site.types.contains(site_type)
        )
        mutations = mutations.filter(has_site_of_type)

    if only_primary:
        mutations = mutations.join(Protein)
        mutations = mutations.filter(Protein.is_preferred_isoform)

    return mutations.count()
예제 #21
0
def count_mutations_from_genes(genes,
                               sources,
                               only_preferred_isoforms=False,
                               strict=True):
    """Counts mutations and PTM mutations from isoforms from given set of genes.

    Args:
        genes: a list of Gene
        sources: a list of MutationDetails - only confirmed mutations from
            sources identified by given MutationDetail classes will be counted
        only_preferred_isoforms: should only one isoform per gene
            (the preferred/primary one) be used when filtering mutations?
        strict: when true, distinct (position, alt, protein id) rows are
            counted; otherwise distinct rows of `Mutation.query` are counted

    Returns:
        A tuple: (count of all mutations, count of PTM mutations).
    """
    all_mutations_count = 0
    ptm_mutations_count = 0

    if strict:
        base_query = (db.session.query(
            Mutation.position, Mutation.alt,
            Protein.id).select_from(Mutation).join(Protein))
    else:
        base_query = Mutation.query

    for gene in tqdm(genes):
        if only_preferred_isoforms:
            proteins = [gene.preferred_isoform]
        else:
            proteins = gene.isoforms

        mutations_filters = and_(
            Mutation.protein_id.in_([p.id for p in proteins]),
            Mutation.is_confirmed == True, Mutation.in_sources(*sources))

        all_mutations_count += (
            base_query.filter(mutations_filters).distinct().count())

        ptm_mutations_count += (base_query.filter(
            and_(Mutation.precomputed_is_ptm,
                 mutations_filters)).distinct().count())

    # with no genes (or no matching mutations) the ratio is undefined;
    # report NaN rather than failing with ZeroDivisionError
    if all_mutations_count:
        ptm_ratio = ptm_mutations_count / all_mutations_count
    else:
        ptm_ratio = float('nan')

    print(all_mutations_count, ptm_mutations_count, ptm_ratio)
    return all_mutations_count, ptm_mutations_count
예제 #22
0
    def test_browse_list(self):
        """Browse a gene list: the static page and the dynamic data."""

        from miscellaneous import make_named_temp_file
        from test_imports.test_gene_list import raw_gene_list
        from imports.protein_data import active_driver_gene_lists as load_active_driver_gene_lists

        filename = make_named_temp_file(raw_gene_list)

        # create the gene list together with its genes
        with self.app.app_context():
            from imports.protein_data import ListData
            tcga_list = ListData(
                name='TCGA', path=filename, mutations_source=TCGAMutation
            )
            gene_lists = load_active_driver_gene_lists(lists=(tcga_list,))
        db.session.add_all(gene_lists)

        # create preferred isoforms for the genes
        for i, gene in enumerate(Gene.query.all()):
            # at least one mutation is required for a gene
            # from a gene list to be displayed
            mutation = Mutation()
            MC3Mutation(mutation=mutation)

            isoform = Protein(refseq=f'NM_000{i}', mutations=[mutation])
            gene.isoforms = [isoform]
            gene.preferred_isoform = isoform

        # check the static template
        response = self.client.get('/gene/list/TCGA')
        assert response.status_code == 200
        assert b'TCGA' in response.data

        # check the dynamic data
        response = self.client.get('/gene/list_data/TCGA?order=asc')
        assert response.status_code == 200

        gene_list = GeneList.query.filter_by(name='TCGA').one()
        payload = response.json

        # all the results were retrieved
        assert payload['total'] == len(gene_list.entries)

        # the rows are properly sorted by fdr
        fdrs = [row['fdr'] for row in payload['rows']]
        assert fdrs == sorted(fdrs)
예제 #23
0
def create_test_models():
    """Build a minimal set of interlinked test models.

    Creates a protein with one mutation (having MC3 and inherited details)
    and one site with an assigned kinase.

    Returns:
        A dict with the created objects under the keys: 'protein',
        'mutation', 'protein_kinase', 'kinase' and 'site'.
    """
    protein = Protein(
        refseq='NM_0001',
        gene=Gene(name='SOMEGENE'),
        sequence='ABCD'
    )
    mutation = Mutation(protein=protein, position=1, alt='E')

    # the details attach themselves to the mutation; no handle is needed
    MC3Mutation(
        mutation=mutation,
        cancer=Cancer(code='CAN'),
        samples='Some sample'
    )
    clinical_record = ClinicalData(disease=Disease(name='Some disease'))
    InheritedMutation(mutation=mutation, clin_data=[clinical_record])

    protein_kinase = Protein(
        refseq='NM_0002',
        gene=Gene(name='OTHERGENE'),
        sequence='ABCD'
    )
    kinase = Kinase(name='Kinase name', protein=protein_kinase)
    site = Site(protein=protein, position=1, residue='A', kinases=[kinase])
    protein.sites = [site]

    # spelled out explicitly; the same mapping that locals() would give here
    return {
        'protein': protein,
        'mutation': mutation,
        'protein_kinase': protein_kinase,
        'kinase': kinase,
        'site': site
    }
예제 #24
0
    def __init__(self,
                 protein,
                 filter_manager,
                 include_kinases_from_groups=False):
        """Prepare `json_data` with mutations, sites and tracks of a protein.

        The value type is taken from the model of the source selected with
        the 'Mutation.sources' filter; the log scale is enabled only when
        that value type is 'frequency'.
        """
        super().__init__(protein, filter_manager, include_kinases_from_groups)

        # only the sites are needed here; kinases and groups are discarded
        sites, _, _ = self.get_sites_and_kinases(only_sites_with_kinases=False)

        tracks = prepare_tracks(protein, self.protein_mutations)

        selected_source = filter_manager.get_value('Mutation.sources')
        value_type = Mutation.get_source_model(selected_source).value_type

        self.json_data = {
            'value_type': value_type,
            'log_scale': (value_type == 'frequency'),
            'mutations': self.represent_needles(),
            'sites': prepare_sites(sites),
            'tracks': tracks
        }
예제 #25
0
    def test_impact_on_specific_ptm(self):
        """Impact of mutations on PTM sites, depending on the distance."""

        # maps each mutation onto the impact expected once the sites exist
        mutations = {
            Mutation(position=10): 'none',  # too far away
            Mutation(position=9): 'none',
            Mutation(position=8): 'distal',
            Mutation(position=4): 'distal',
            Mutation(position=3): 'proximal',
            Mutation(position=2): 'proximal',
            Mutation(position=1): 'direct'
        }

        protein = Protein(refseq='NM_00001', mutations=mutations.keys())

        db.session.add(protein)

        # case 0: there are no sites in the protein
        for mutation in mutations:
            assert mutation.impact_on_ptm() == 'none'

        # case 1: there are some sites in the protein
        first_site, distant_site = Site(position=1), Site(position=50)
        protein.sites = [first_site, distant_site]
        site = protein.sites[0]

        for mutation, expected_impact in mutations.items():
            print(mutation)
            assert mutation.impact_on_ptm() == expected_impact
            assert mutation.impact_on_specific_ptm(site) == expected_impact

        # case 2: there are some sites, but a site filter excludes them all
        def site_filter(sites):
            return []

        for mutation in mutations:
            assert mutation.impact_on_ptm(site_filter=site_filter) == 'none'
예제 #26
0
 def count_by_sources(sources: List[MutationSource]):
     """Count mutations matching `Mutation.in_sources` for given sources."""
     in_given_sources = Mutation.in_sources(*sources)
     matching = Mutation.query.filter(in_given_sources)
     return matching.count()
예제 #27
0
 def confirmed_with_mimp(self):
     """Count confirmed mutations which also have MIMP details."""
     criteria = and_(
         Mutation.in_sources(models.MIMPMutation),
         Mutation.is_confirmed,
     )
     return Mutation.query.filter(criteria).count()
예제 #28
0
    def test_autocomplete_all(self):
        """Exercise the unified autocomplete endpoint for all entry types.

        Covers gene and amino acid mutation queries, nucleotide mutation
        queries (including the fallback to the complementary strand),
        pathway queries (by text, Reactome and Gene Ontology identifiers),
        disease queries and the "disease in gene" suggestions.
        """

        # MC3 GeneList is required as a target (a href for links) where users will be pointed
        # after clicking of cancer autocomplete suggestion. Likewise with the ClinVar list.
        db.session.add_all([
            GeneList(name=name, mutation_source_name=detail_class.name)
            for name, detail_class in [
                ('TCGA', MC3Mutation), ('ClinVar', InheritedMutation)
            ]
        ])

        g = Gene(name='BR')
        p = Protein(id=1, refseq='NM_007', gene=g, sequence='XXXXXV')
        g.preferred_isoform = p     # required for gene search to work - genes without preferred isoforms are ignored
        mut = Mutation(protein=p, position=6, alt='E')
        db.session.add_all([mut, p, g])

        def autocomplete(query):
            """Query the endpoint and visit every URL found in the response."""
            r = self.client.get('/search/autocomplete_all/?q=' + query)
            self.visit_returned_urls(r)
            return r

        from database import bdb_refseq, bdb
        bdb_refseq['BR V6E'] = [p.id]  # required for mutation search
        bdb.add_genomic_mut('1', 10000, 'T', 'C', mut)

        # Gene and mutations

        response = autocomplete('BR V6E')
        entry = get_entry_and_check_type(response, 'aminoacid mutation')
        assert entry

        response = autocomplete('BR V6')
        entry = get_entry_and_check_type(response, 'message')
        assert 'Awaiting for <code>{alt}</code>' in entry['name']

        response = autocomplete('BR V')
        entry = get_entry_and_check_type(response, 'message')
        assert 'Awaiting for <code>{pos}{alt}</code>' in entry['name']

        response = autocomplete('B')
        entry = get_entry_and_check_type(response, 'gene')
        assert 'BR' == entry['name']

        # genomic mutation
        response = autocomplete('chr1 10000 T C')
        entry = get_entry_and_check_type(response, 'nucleotide mutation')
        assert entry and entry['input'] == 'CHR1 10000 T C'

        # is the search falling back to the other strand?
        response = autocomplete('chr1 10000 A G')
        entry = get_entry_and_check_type(response, 'nucleotide mutation')
        assert entry and entry['input'] == 'complement of CHR1 10000 A G'

        prompt = 'Awaiting for mutation in <code>{chrom} {pos} {ref} {alt}</code> format'

        # every incomplete nucleotide query should result in the same prompt
        for prompt_invoking_query in ['chr1', 'chr1 ', 'chr1 40', 'chr1 40 ', 'chr1 40 T']:
            response = autocomplete(prompt_invoking_query)
            entry = get_entry_and_check_type(response, 'message')
            assert entry['name'] == prompt

        # Pathways

        pathways = [
            Pathway(description='Activation of RAS in B cells', reactome=1169092),
            Pathway(description='abortive mitotic cell cycle', gene_ontology=33277),
            Pathway(description='amacrine cell differentiation', gene_ontology=35881),
            Pathway(description='amniotic stem cell differentiation', gene_ontology=97086)
        ]

        db.session.add_all(pathways)

        # test partial matching and Reactome id pathways search
        for ras_activation_query in ['Activation', 'REAC:1', 'REAC:1169092']:
            response = autocomplete(ras_activation_query)
            entry = get_entry_and_check_type(response, 'pathway')
            assert entry['name'].startswith('Activation of RAS in B cells')

        # test Gene Ontology search:
        response = autocomplete('GO:33')
        go_pathway = get_entry_and_check_type(response, 'pathway')
        assert go_pathway['name'] == 'abortive mitotic cell cycle (GO:33277)'

        # check if multiple pathways are returned
        response = autocomplete('differentiation')
        assert len(response.json['entries']) == 2

        # check if both genes and pathways are returned simultaneously
        # there should be: a pathway ('a>b<ortive...') and the >B<R gene
        response = autocomplete('b')
        entries = response.json['entries']
        names = [entry['name'] for entry in entries]
        assert all([name in names for name in ['BR', 'abortive mitotic cell cycle']])

        # check if "search more pathways" is displayed
        response = autocomplete('cell')    # cell occurs in all four of added pathways;
        # as a limit of pathways shown is 3, we should get a "show more" link
        links = entries_with_type(response, 'see_more')
        assert len(links) == 1
        assert links[0]['name'] == 'Show all pathways matching <i>cell</i>'

        # test case insensitive text search
        response = autocomplete('AMNIOTIC STEM')
        pathways = entries_with_type(response, 'pathway')
        assert len(pathways) == 1
        assert pathways[0]['name'] == 'amniotic stem cell differentiation'

        # Disease
        disease_names = [
            'Cystic fibrosis', 'Polycystic kidney disease 2',
            'Frontotemporal dementia', 'Cataract, nuclear total'
        ]
        diseases = {name: Disease(name=name) for name in disease_names}
        db.session.add_all(diseases.values())

        response = autocomplete('cystic')
        cystic_matching = entries_with_type(response, 'disease')
        # both 'Cystic fibrosis' and PKD2 should match
        assert len(cystic_matching) == 2

        # is comma containing disease name properly linked?
        response = autocomplete('Cataract')
        cataract = get_entry_and_check_type(response, 'disease')
        assert cataract['name'] == 'Cataract, nuclear total'

        # Gene mutation in disease

        # test suggestions
        response = autocomplete('cystic ')
        entry = entries_with_type(response, 'message')[0]
        # a raw string is used, so that `\?` reaches the regex engine
        # without relying on an invalid string escape sequence
        assert re.match(r'Do you wish to search for (.*?) mutations\?', entry['name'])

        # currently there are no mutations associated with any disease
        # so the auto-completion should not return any results
        response = autocomplete('cystic in ')
        assert not response.json['entries']

        # let's add a mutation
        m = Mutation(protein=p, position=1, alt='Y')
        bdb_refseq['BR X1Y'] = ['NM_007']
        # note: sig_code is required here
        data = ClinicalData(disease=diseases['Cystic fibrosis'], sig_code=1)
        disease_mutation = InheritedMutation(mutation=m, clin_data=[data])
        db.session.add_all([m, data, disease_mutation])

        # should return '.. in BR' suggestion now.
        for query in ['cystic in', 'cystic in ']:
            response = autocomplete(query)
            result = get_entry_and_check_type(response, 'disease_in_protein')
            assert result['gene'] == 'BR'
            assert result['name'] == 'Cystic fibrosis'

        # both gene search and refseq search should yield the same, non-empty results
        results = []

        for query in ['cystic in BR', 'cystic in NM_007', 'cystic in 007']:
            response = autocomplete(query)
            result = get_entry_and_check_type(response, 'disease_in_protein')
            results.append(result)

        assert all(r == result for r in results) and result
예제 #29
0
def source_specific_nucleotide_mappings():
    """Count nucleotide mappings of known mutations, separately for each source.

    Mutations are loaded source by source (and once more, all confirmed ones
    together, as the 'merged' pseudo-source). Then every genomic mapping
    stored in `bdb` is looked up among the loaded mutations and counted for
    each source in which the mapped mutation was seen.

    Returns:
        A dict with a single 'Nucleotide mappings' key, which maps each source
        model (or the 'merged' string) to the number of mappings pointing to
        a mutation from that source. Sources without any mapping are omitted.
    """
    from database import bdb
    from genomic_mappings import decode_csv
    from models import Mutation
    from tqdm import tqdm
    from gc import collect

    # Mutation key -> bitmask of sources; bit `n` is set when the mutation was
    # seen in the source of index `n`. Integers are used instead of
    # concatenated id strings, because substring tests on such strings
    # ('1' in '10') give false positives once there are more than ten ids.
    sources_of_mutation = defaultdict(int)

    def mutation_key(protein_id, alt, position):
        # `alt` separates the two numbers, keeping the key compact yet unambiguous
        return str(protein_id) + alt + str(position)

    def query_mutations(*criteria):
        return (
            db.session.query(Mutation.protein_id, Mutation.alt, Mutation.position)
            .filter(*criteria)
            .yield_per(5000)
        )

    def count_mutations(mutations_query, source_bit):
        for protein_id, alt, position in tqdm(mutations_query, total=mutations_query.count()):
            sources_of_mutation[mutation_key(protein_id, alt, position)] |= source_bit

    sources_map = dict(enumerate(mutation_sources().values()))

    print('Loading mutations from sources:')
    for source_index, model in tqdm(sources_map.items(), total=len(sources_map)):
        # no need for '.filter(Mutation.is_confirmed==True)'
        # (if it is in source of interest, it is confirmed - we do not count MIMPs here)
        count_mutations(
            query_mutations(Mutation.in_sources(model)),
            1 << source_index
        )

    # add merged
    merged_index = len(sources_map)
    sources_map[merged_index] = 'merged'
    print('Loading merged mutations:')

    # '== True' is intentional: it builds an SQL expression, not a Python bool
    count_mutations(
        query_mutations(Mutation.is_confirmed == True),  # noqa: E712
        1 << merged_index
    )

    print('Mutations loaded')
    collect()

    def iterate_known_muts_sources():
        for value in tqdm(bdb.values(), total=len(bdb.db)):
            for item in map(decode_csv, value):
                # .get() does not insert missing keys into the defaultdict
                sources = sources_of_mutation.get(
                    mutation_key(item['protein_id'], item['alt'], item['pos'])
                )
                if sources:
                    yield sources

    source_bits = [(index, 1 << index) for index in sources_map]
    counts = defaultdict(int)

    for sources in iterate_known_muts_sources():
        for index, bit in source_bits:
            if sources & bit:
                counts[index] += 1

    return {
        'Nucleotide mappings': {
            sources_map[index]: value
            for index, value in counts.items()
        }
    }
예제 #30
0
    def test_mutation(self):
        """Query '/chromosome/mutation/...' for a novel and for a known mutation.

        Two mutations of a test protein are registered in `bdb` under genomic
        positions on chromosome 20; the JSON responses are then checked, first
        without any dataset annotations, later with MC3 and ESP6500 metadata
        and with the source-specific filters applied.
        """

        # a protein of 15 'A' residues with a single methylation site at position 13
        s = Site(position=13, types={SiteType(name='methylation')})
        p = Protein(refseq='NM_007',
                    id=1,
                    sites=[s],
                    sequence='A' * 15,
                    gene=Gene(name='SomeGene'))

        db.session.add(p)

        from database import bdb

        # amino acid position -> genomic position;
        # the values are replaced with Mutation objects in the loop below
        muts = {13: 14370, 15: 14376}

        for aa_pos, dna_pos in muts.items():
            # only values of existing keys are overwritten (the dict size
            # does not change), so mutating it while iterating is safe
            muts[aa_pos] = Mutation(protein=p, position=aa_pos, alt='V')
            bdb.add_genomic_mut('20',
                                dna_pos,
                                'G',
                                'A',
                                muts[aa_pos],
                                is_ptm=True)

        query_url = '/chromosome/mutation/{chrom}/{pos}/{ref}/{alt}'

        # query as a novel mutation
        # (it was not added to the session and has no dataset metadata attached)
        response = self.client.get(
            query_url.format(chrom='chr20', pos=14370, ref='G', alt='A'))

        assert response.status_code == 200
        # the mutation is located exactly on the site's position (13);
        # NOTE(review): presumably this is why 'ptm_impact' is 'direct' - confirm
        assert response.json == [{
            'alt':
            'V',
            'gene':
            'SomeGene',
            'in_datasets': {},
            'pos':
            13,
            'ptm_impact':
            'direct',
            'cnt_ptm':
            1,
            'closest_sites': ['13 A'],
            'protein':
            'NM_007',
            'sites': [{
                'kinases': [],
                'position': 13,
                'residue': 'A',
                'kinase_groups': [],
                'type': 'methylation'
            }],
            'ref':
            'A'
        }]

        # now let's look at a known mutation (one having metadata in two datasets):
        m = muts[15]
        mc3 = MC3Mutation(mutation=m,
                          cancer=Cancer(name='Breast invasive carcinoma',
                                        code='BRCA'),
                          count=1)
        # note: only maf_all and maf_aa are given here (no maf_ea)
        esp = ExomeSequencingMutation(mutation=m, maf_all=0.02, maf_aa=0.02)

        db.session.add_all([m, mc3, esp])
        db.session.commit()

        mutation_a15v_query = query_url.format(chrom='chr20',
                                               pos=14376,
                                               ref='G',
                                               alt='A')
        response = self.client.get(mutation_a15v_query)

        # expected per-dataset metadata of the known mutation;
        # NOTE(review): 'MAF EA' is presumably None as maf_ea was not set above - confirm
        metadata = {
            'MC3': {
                'Cancers': [{
                    'Cancer': 'Breast invasive carcinoma',
                    'Value': 1
                }]
            },
            'ESP6500': {
                'MAF': 0.02,
                'MAF AA': 0.02,
                'MAF EA': None
            }
        }

        assert response.json[0]['in_datasets'] == metadata

        # expected 'value' field when the results are limited to a single source
        expected_values = {'MC3': 1, 'ESP6500': 0.02}

        # a user who does not want to download data for all datasets
        # may limit the results to a single source with a filter:
        for source, meta in metadata.items():
            response = self.client.get(mutation_a15v_query +
                                       '?filters=Mutation.sources:in:' +
                                       source)
            json = response.json[0]
            assert json['in_datasets'] == {source: meta}
            assert json['value'] == expected_values[source]

        # source-specific filters: a matching cancer code keeps the mutation
        response = self.client.get(
            mutation_a15v_query +
            '?filters=Mutation.sources:in:MC3;Mutation.mc3_cancer_code:in:BRCA'
        )
        assert response.json

        # a population filter keeps the mutation for 'African American'...
        response = self.client.get(
            mutation_a15v_query +
            '?filters=Mutation.sources:in:ESP6500;Mutation.populations_ESP6500:in:African American'
        )
        assert response.json

        # ...but the result is empty for 'European American'
        # NOTE(review): presumably because maf_ea was not set for this mutation - confirm
        response = self.client.get(
            mutation_a15v_query +
            '?filters=Mutation.sources:in:ESP6500;Mutation.populations_ESP6500:in:European American'
        )
        assert not response.json