def test_divide_muts_by_sites(self):
    """Check how `divide_muts_by_sites` distributes mutations among sites.

    Three scenarios are exercised: empty input, a single site without
    mutations, and three sites (positions 1, 10, 20) with mutations placed
    so that some of them are expected under two neighbouring sites while
    the mutation at position 30 is expected under none.
    """
    from views.network import divide_muts_by_sites

    # degenerate input: nothing to divide, the call only has to succeed
    divide_muts_by_sites([], [])

    # a lone site with no mutations maps to an empty list
    first_site = Site(position=1)
    grouped = divide_muts_by_sites([], [first_site])
    assert grouped[first_site] == []

    # the complete scenario
    second_site = Site(position=10)
    third_site = Site(position=20)

    muts_by_pos = {}
    for position in (1, 2, 8, 14, 16, 30):
        muts_by_pos[position] = [Mutation(position=position)]

    # position 16 holds two separate mutation objects
    muts_by_pos[16].append(Mutation(position=16))

    def get_muts_from_pos(*positions):
        """Return the mutations stored for given positions as one flat list."""
        per_position = [muts_by_pos[position] for position in positions]
        flattened = []
        for muts_at_position in per_position:
            flattened.extend(muts_at_position)
        return flattened

    every_mutation = []
    for muts_at_position in muts_by_pos.values():
        every_mutation.extend(muts_at_position)

    grouped = divide_muts_by_sites(
        every_mutation,
        [first_site, second_site, third_site]
    )

    # mutations at 8, 14 and 16 are expected under two sites each
    assert grouped[first_site] == get_muts_from_pos(1, 2, 8)
    assert grouped[second_site] == get_muts_from_pos(8, 14, 16)
    assert grouped[third_site] == get_muts_from_pos(14, 16)
def create_network():
    """Store a small interaction-network fixture in the database.

    The fixture consists of a test protein with two phosphorylation sites,
    a kinase known to act on the first site ('Kinase Y'), a kinase that is
    only referenced by a MIMP prediction ('Kinase Z'), a kinase group
    attached to both sites, an approved drug targeting the gene of the known
    kinase, and MC3 mutations in both the protein and the known kinase.
    Everything is committed and the cached filter queries are reloaded.
    """
    protein = create_test_protein()
    ovarian = Cancer(name='Ovarian', code='OV')

    known_kinase = create_test_kinase('Kinase Y', 'NM_0009')

    kinase_mc3 = MC3Mutation(cancer=ovarian)
    known_kinase.protein.mutations = [
        Mutation(position=1, alt='T', meta_MC3=[kinase_mc3])
    ]

    # by default only approved drugs are shown
    approved = DrugGroup(name='approved')
    drug = Drug(
        name='Drug targeting ' + known_kinase.name,
        drug_bank_id='DB01',
        target_genes=[known_kinase.protein.gene],
        groups={approved}
    )

    kinase_group = KinaseGroup(name='Group of kinases')

    site_with_kinase = Site(
        position=1,
        type='phosphorylation',
        residue='T',
        kinases=[known_kinase],
        kinase_groups=[kinase_group]
    )
    site_with_group_only = Site(
        position=2,
        type='phosphorylation',
        residue='R',
        kinase_groups=[kinase_group]
    )
    protein.sites = [site_with_kinase, site_with_group_only]

    predicted_kinase = create_test_kinase('Kinase Z', 'NM_0002')

    protein_mc3 = MC3Mutation(cancer=ovarian)
    # one motif loss for the known kinase and one gain for the predicted one,
    # both attached to the first site
    loss_of_known = MIMPMutation(
        pwm=known_kinase.name,
        effect='loss',
        site=site_with_kinase,
        probability=0.1,
        position_in_motif=1
    )
    gain_of_predicted = MIMPMutation(
        pwm=predicted_kinase.name,
        effect='gain',
        site=site_with_kinase,
        probability=0.1,
        position_in_motif=1
    )
    protein.mutations = [
        Mutation(
            position=2,
            alt='T',
            meta_MC3=[protein_mc3],
            meta_MIMP=[loss_of_known, gain_of_predicted]
        )
    ]

    db.session.add_all([protein, drug, predicted_kinase])
    db.session.commit()

    # a new cancer was added, so the cached queries have to be reloaded
    # (this should not happen during normal app usage)
    from website.views.filters import cached_queries
    cached_queries.reload()
def test_search_mutations(self):
    """Search for mutations by a genomic query, a repeated query and a VCF file.

    A protein with a single methylation site at position 13 gets one
    mutation on the site and one away from it; only the former is mapped
    from a genomic location and expected in the results.
    """
    site = Site(position=13, types={SiteType(name='methylation')})
    protein = Protein(refseq='NM_007', id=7, sites=[site], sequence='XXXXXXXXXXXXV')

    m_in_site = Mutation(protein=protein, position=13, alt='V')
    m_out_site = Mutation(protein=protein, position=50, alt='K')

    db.session.add(protein)

    def table_cell(content):
        """Encode the content as the bytes of an HTML table cell."""
        return '<td>{0}</td>'.format(content).encode()

    # points to the same location as first record in VCF_FILE_CONTENT
    test_query = 'chr20 14370 G A'

    from database import bdb

    # map the first genomic mutation from VCF_FILE_CONTENT
    # to some (mocked) protein mutation
    bdb.add_genomic_mut('20', 14370, 'G', 'A', m_in_site, is_ptm=True)

    #
    # basic test - is appropriate mutation in results?
    #
    response = self.search_mutations(mutations=test_query)
    assert response.status_code == 200

    # this mutation is exactly at a PTM site and should be included in results
    assert table_cell(m_in_site.alt) in response.data
    # this mutation lies outside of a PTM site - by default should be filtered out
    assert table_cell(m_out_site.alt) not in response.data

    #
    # count test - is mutation for this query annotated as shown twice?
    #
    doubled_query = '{0}\n{0}'.format(test_query)
    response = self.search_mutations(mutations=doubled_query)
    assert response.status_code == 200
    assert b'<td>2</td>' in response.data

    #
    # VCF file test
    #
    upload = {
        'vcf-file': (BytesIO(VCF_FILE_CONTENT), 'exemplar_vcf.vcf')
    }
    response = self.client.post(
        '/search/mutations',
        content_type='multipart/form-data',
        data=upload
    )
    assert response.status_code == 200
    assert b'NM_007' in response.data
def create_test_models():
    """Build a set of interlinked model instances for tests.

    Returns:
        dict with the keys 'protein', 'mutation', 'protein_kinase',
        'kinase' and 'site'. The MC3 and inherited (clinical) mutation
        details are created with `mutation=mutation` but are not
        included in the returned mapping themselves.
    """
    some_gene = Gene(name='SOMEGENE')
    protein = Protein(refseq='NM_0001', gene=some_gene, sequence='ABCD')
    mutation = Mutation(protein=protein, position=1, alt='E')
    protein.gene.preferred_isoform = protein

    MC3Mutation(
        mutation=mutation,
        cancer=Cancer(code='CAN'),
        samples='Sample A,Sample B',
        count=2
    )

    clinical_entries = [
        ClinicalData(disease=Disease(name=disease_name), sig_code=sig_code)
        for disease_name, sig_code in (
            ('Some disease', 5),
            ('Other disease', 2),
        )
    ]
    InheritedMutation(mutation=mutation, clin_data=clinical_entries)

    other_gene = Gene(name='OTHERGENE')
    protein_kinase = Protein(refseq='NM_0002', gene=other_gene, sequence='ABCD')
    kinase = Kinase(name='Kinase name', protein=protein_kinase)

    glycosylation = SiteType(name='glycosylation')
    site = Site(
        protein=protein,
        position=1,
        residue='A',
        kinases={kinase},
        pmid={1, 2},
        types={glycosylation}
    )
    protein.sites = [site]

    # only the named models are exposed, helper variables stay private
    return {
        'protein': protein,
        'mutation': mutation,
        'protein_kinase': protein_kinase,
        'kinase': kinase,
        'site': site,
    }
def test_prepare_dataset(self):
    """Check the dataset descriptions produced by `prepare_datasets`.

    For a mutation having only MC3 details, every confirmed source is
    expected to be listed, with `mutation_present` being False everywhere
    except for MC3, where the list of details is expected. No user
    datasets are expected.
    """
    from views.mutation import prepare_datasets

    gene = Gene(name='TP53')
    protein = Protein(refseq='NM_000123', sequence='TRAN', gene=gene)
    mutation = Mutation(protein=protein, position=2, alt='K')
    details = MC3Mutation(mutation=mutation, count=2)

    db.session.add(mutation)

    datasets, user_datasets = prepare_datasets(mutation)

    expected_datasets = []
    for source in source_manager.confirmed:
        # the mutation is present only in the MC3 source
        presence = [details] if source is MC3Mutation else False
        expected_datasets.append({
            'filter': 'Mutation.sources:in:' + source.name,
            'name': source.display_name,
            'mutation_present': presence
        })

    assert datasets == expected_datasets
    assert not user_datasets
def test_sites(self):
    """Verify how many sites mutations report as closest and as affected.

    Mutations at positions 0, 5, 12 and 57 are tested against sites at
    positions 10, 14, 15 and 57 of the same protein.
    """
    mutations = []
    for position in (0, 5, 12, 57):
        mutations.append(Mutation(position=position))

    sites = []
    for position in (10, 14, 15, 57):
        sites.append(Site(position=position))

    protein = Protein(refseq='NM_00002', mutations=mutations, sites=sites)

    db.session.add(protein)
    db.session.commit()

    # ==test_find_closest_sites==
    # for mutation at position 0 there is no closest site;
    # for mutation at position 5 there should be 1 closest site
    closest_counts = dict(zip(mutations, [0, 1, 2, 1]))

    for mutation, expected_count in closest_counts.items():
        closest = mutation.find_closest_sites()
        assert len(closest) == expected_count

    # ==test_get_affected_ptm_sites==
    affected_counts = dict(zip(mutations, [0, 1, 3, 1]))

    for mutation, expected_count in affected_counts.items():
        affected = mutation.get_affected_ptm_sites()
        assert len(affected) == expected_count
def test_mutate(self):
    """Check the sequences `mutate_sequence` returns for a site at position 3.

    With offset=2 and alt='X', a mutation at position 1 is expected to give
    'XBCDE' and a mutation at position 3 is expected to give 'ABXDE'.
    """
    protein = Protein(sequence='ABCDE')
    site = Site(protein=protein, position=3, residue='C')

    # (position of the mutation, expected sequence)
    expectations = (
        (1, 'XBCDE'),
        (3, 'ABXDE'),
    )

    for mutated_position, expected_sequence in expectations:
        mutation = Mutation(protein=protein, position=mutated_position, alt='X')
        mutated = mutate_sequence(site, mutation, offset=2)
        assert mutated == expected_sequence
def test_impact_on_ptm(self):
    """A mutation placed exactly on one of the sites should have 'direct' impact."""
    mutations = [Mutation(position=61)]
    protein = Protein(refseq='NM_00001', mutations=mutations)

    db.session.add(protein)

    # the first site shares its position with the mutation
    protein.sites = [
        Site(position=site_position)
        for site_position in (61, 54, 51)
    ]

    tested_mutation = mutations[0]
    impact = tested_mutation.impact_on_ptm()

    assert impact == 'direct'
def test_show(self):
    """The mutation page should load and mention both the gene and the refseq."""
    gene = Gene(name='TP53')
    protein = Protein(refseq='NM_000123', sequence='TRAN', gene=gene)
    mutation = Mutation(protein=protein, position=2, alt='K')

    db.session.add(mutation)

    response = self.client.get('/mutation/show/NM_000123/2/K')

    assert response.status_code == 200

    for expected_fragment in (b'TP53', b'NM_000123'):
        assert expected_fragment in response.data
def test_browse_list(self):
    """Check both the static page and the JSON data of the 'TCGA' gene list.

    A gene list is loaded from a temporary file containing `raw_gene_list`,
    every gene found in the database gets a preferred isoform with a single
    MC3-annotated mutation, and then the '/gene/list/TCGA' page and the
    '/gene/list_data/TCGA' endpoint are requested.
    """
    from miscellaneous import make_named_temp_file
    from test_imports.test_gene_list import raw_gene_list
    from imports.protein_data import active_driver_gene_lists as load_active_driver_gene_lists

    filename = make_named_temp_file(raw_gene_list)

    # create gene list and genes
    with self.app.app_context():
        from imports.protein_data import ListData
        gene_lists = load_active_driver_gene_lists(lists=(
            ListData(name='TCGA', path=filename, mutations_source=TCGAMutation),
        ))
    db.session.add_all(gene_lists)

    # create preferred isoforms for genes
    for i, gene in enumerate(Gene.query.all()):
        # at least one mutation is required for gene on a gene list to be displayed
        mut = Mutation()
        MC3Mutation(mutation=mut)

        # the enumeration index keeps the refseq of each isoform distinct
        p = Protein(refseq='NM_000%s' % i, mutations=[mut])
        gene.isoforms = [p]
        gene.preferred_isoform = p

    # check the static template
    response = self.client.get('/gene/list/TCGA')
    assert response.status_code == 200
    assert b'TCGA' in response.data

    # check the dynamic data
    response = self.client.get('/gene/list_data/TCGA?order=asc')
    assert response.status_code == 200

    gene_list = GeneList.query.filter_by(name='TCGA').one()

    # all results retrieved
    assert response.json['total'] == len(gene_list.entries)

    # properly sorted by fdr
    # (ascending order was requested with `order=asc` in the query string)
    fdrs = [row['fdr'] for row in response.json['rows']]
    assert fdrs == sorted(fdrs)
def create_test_models():
    """Build a minimal set of interlinked model instances for tests.

    Returns:
        dict with the keys 'protein', 'mutation', 'protein_kinase',
        'kinase' and 'site'. The MC3 and inherited mutation details are
        created with `mutation=mutation` but are not part of the returned
        mapping.
    """
    some_gene = Gene(name='SOMEGENE')
    protein = Protein(refseq='NM_0001', gene=some_gene, sequence='ABCD')
    mutation = Mutation(protein=protein, position=1, alt='E')

    MC3Mutation(
        mutation=mutation,
        cancer=Cancer(code='CAN'),
        samples='Some sample'
    )

    some_disease = Disease(name='Some disease')
    InheritedMutation(
        mutation=mutation,
        clin_data=[ClinicalData(disease=some_disease)]
    )

    other_gene = Gene(name='OTHERGENE')
    protein_kinase = Protein(refseq='NM_0002', gene=other_gene, sequence='ABCD')
    kinase = Kinase(name='Kinase name', protein=protein_kinase)

    site = Site(protein=protein, position=1, residue='A', kinases=[kinase])
    protein.sites = [site]

    # only the named models are exposed, helper variables stay private
    return {
        'protein': protein,
        'mutation': mutation,
        'protein_kinase': protein_kinase,
        'kinase': kinase,
        'site': site,
    }
def test_impact_on_specific_ptm(self):
    """Check the impact categories reported for mutations near a site.

    The expected categories refer to a site at position 1; the same
    mutations are also checked before any site exists and with a site
    filter which discards all sites.
    """
    # (position of the mutation, impact expected once the sites are added)
    expected_impacts = (
        (10, 'none'),    # too far away
        (9, 'none'),
        (8, 'distal'),
        (4, 'distal'),
        (3, 'proximal'),
        (2, 'proximal'),
        (1, 'direct'),
    )
    mutations = {
        Mutation(position=position): impact
        for position, impact in expected_impacts
    }

    protein = Protein(refseq='NM_00001', mutations=mutations.keys())

    db.session.add(protein)

    # case 0: there are no sites in the protein
    for mutation in mutations:
        assert mutation.impact_on_ptm() == 'none'

    # case 1: there are some sites in the protein
    protein.sites = [Site(position=1), Site(position=50)]
    site = protein.sites[0]

    for mutation, expected_impact in mutations.items():
        print(mutation)
        assert mutation.impact_on_ptm() == expected_impact
        assert mutation.impact_on_specific_ptm(site) == expected_impact

    # case 2: there are some sites but all will be excluded by a site filter
    def site_filter(sites):
        """Discard every site."""
        return []

    for mutation in mutations:
        assert mutation.impact_on_ptm(site_filter=site_filter) == 'none'
def mut(pos):
    """Create a mutation to 'X' at the given position of the protein `p`.

    `p` is not defined in this helper; it is taken from the enclosing scope.
    """
    mutation = Mutation(position=pos, alt='X', protein=p)
    return mutation
def test_autocomplete_all(self):
    """Exercise the '/search/autocomplete_all/' endpoint with all kinds of queries.

    Covered, in order: amino acid mutations and genes, genomic mutations
    (including the complementary strand and the format prompts), pathways
    (text, Reactome and Gene Ontology identifiers, the "see more" link),
    diseases, and finally mutations of a gene in a disease.

    Every response is additionally passed to `self.visit_returned_urls`.
    """
    # MC3 GeneList is required as a target (a href for links) where users will be pointed
    # after clicking of cancer autocomplete suggestion. Likewise with the ClinVar list.
    db.session.add_all([
        GeneList(name=name, mutation_source_name=detail_class.name)
        for name, detail_class in [
            ('TCGA', MC3Mutation),
            ('ClinVar', InheritedMutation)
        ]
    ])

    g = Gene(name='BR')
    p = Protein(id=1, refseq='NM_007', gene=g, sequence='XXXXXV')
    # required for gene search to work - genes without preferred isoforms are ignored
    g.preferred_isoform = p
    mut = Mutation(protein=p, position=6, alt='E')
    db.session.add_all([mut, p, g])

    def autocomplete(query):
        """Request suggestions for the query and visit the URLs which were returned."""
        r = self.client.get('/search/autocomplete_all/?q=' + query)
        self.visit_returned_urls(r)
        return r

    from database import bdb_refseq, bdb
    bdb_refseq['BR V6E'] = [p.id]  # required for mutation search
    bdb.add_genomic_mut('1', 10000, 'T', 'C', mut)

    # Gene and mutations
    response = autocomplete('BR V6E')
    entry = get_entry_and_check_type(response, 'aminoacid mutation')
    assert entry

    response = autocomplete('BR V6')
    entry = get_entry_and_check_type(response, 'message')
    assert 'Awaiting for <code>{alt}</code>' in entry['name']

    response = autocomplete('BR V')
    entry = get_entry_and_check_type(response, 'message')
    assert 'Awaiting for <code>{pos}{alt}</code>' in entry['name']

    response = autocomplete('B')
    entry = get_entry_and_check_type(response, 'gene')
    assert 'BR' == entry['name']

    # genomic mutation
    response = autocomplete('chr1 10000 T C')
    entry = get_entry_and_check_type(response, 'nucleotide mutation')
    assert entry and entry['input'] == 'CHR1 10000 T C'

    # is the search falling back to the other strand?
    response = autocomplete('chr1 10000 A G')
    entry = get_entry_and_check_type(response, 'nucleotide mutation')
    assert entry and entry['input'] == 'complement of CHR1 10000 A G'

    prompt = 'Awaiting for mutation in <code>{chrom} {pos} {ref} {alt}</code> format'

    for prompt_invoking_query in ['chr1', 'chr1 ', 'chr1 40', 'chr1 40 ', 'chr1 40 T']:
        response = autocomplete(prompt_invoking_query)
        entry = get_entry_and_check_type(response, 'message')
        assert entry['name'] == prompt

    # Pathways
    pathways = [
        Pathway(description='Activation of RAS in B cells', reactome=1169092),
        Pathway(description='abortive mitotic cell cycle', gene_ontology=33277),
        Pathway(description='amacrine cell differentiation', gene_ontology=35881),
        Pathway(description='amniotic stem cell differentiation', gene_ontology=97086)
    ]
    db.session.add_all(pathways)

    # test partial matching and Reactome id pathways search
    for ras_activation_query in ['Activation', 'REAC:1', 'REAC:1169092']:
        response = autocomplete(ras_activation_query)
        entry = get_entry_and_check_type(response, 'pathway')
        assert entry['name'].startswith('Activation of RAS in B cells')

    # test Gene Ontology search:
    response = autocomplete('GO:33')
    go_pathway = get_entry_and_check_type(response, 'pathway')
    assert go_pathway['name'] == 'abortive mitotic cell cycle (GO:33277)'

    # check if multiple pathways are returned
    response = autocomplete('differentiation')
    assert len(response.json['entries']) == 2

    # check if both genes and pathways are returned simultaneously
    # there should be: a pathway ('a>b<ortive...') and the >B<R gene
    response = autocomplete('b')
    entries = response.json['entries']
    names = [entry['name'] for entry in entries]
    assert all(name in names for name in ['BR', 'abortive mitotic cell cycle'])

    # check if "search more pathways" is displayed
    response = autocomplete('cell')
    # cell occurs in all four of added pathways;
    # as a limit of pathways shown is 3, we should get a "show more" link
    links = entries_with_type(response, 'see_more')
    assert len(links) == 1
    assert links[0]['name'] == 'Show all pathways matching <i>cell</i>'

    # test case insensitive text search
    response = autocomplete('AMNIOTIC STEM')
    pathways = entries_with_type(response, 'pathway')
    assert len(pathways) == 1
    assert pathways[0]['name'] == 'amniotic stem cell differentiation'

    # Disease
    disease_names = [
        'Cystic fibrosis',
        'Polycystic kidney disease 2',
        'Frontotemporal dementia',
        'Cataract, nuclear total'
    ]
    diseases = {name: Disease(name=name) for name in disease_names}
    db.session.add_all(diseases.values())

    response = autocomplete('cystic')
    cystic_matching = entries_with_type(response, 'disease')
    # both 'Cystic fibrosis' and PKD2 should match
    assert len(cystic_matching) == 2

    # is comma containing disease name properly linked?
    response = autocomplete('Cataract')
    cataract = get_entry_and_check_type(response, 'disease')
    assert cataract['name'] == 'Cataract, nuclear total'

    # Gene mutation in disease

    # test suggestions
    response = autocomplete('cystic ')
    entry = entries_with_type(response, 'message')[0]
    # raw string: `\?` is not a valid escape sequence in an ordinary
    # string literal, even though the resulting pattern is the same
    assert re.match(r'Do you wish to search for (.*?) mutations\?', entry['name'])

    # currently there are no mutations associated with any disease
    # so the auto-completion should not return any results
    response = autocomplete('cystic in ')
    assert not response.json['entries']

    # let's add a mutation
    m = Mutation(protein=p, position=1, alt='Y')
    # NOTE(review): the 'BR V6E' entry above stores protein ids while this one
    # stores a refseq - confirm which of these the lookup actually expects
    bdb_refseq['BR X1Y'] = ['NM_007']

    # note: sig_code is required here
    data = ClinicalData(disease=diseases['Cystic fibrosis'], sig_code=1)
    disease_mutation = InheritedMutation(mutation=m, clin_data=[data])
    db.session.add_all([m, data, disease_mutation])

    # should return '.. in BR' suggestion now.
    for query in ['cystic in', 'cystic in ']:
        response = autocomplete(query)
        result = get_entry_and_check_type(response, 'disease_in_protein')
        assert result['gene'] == 'BR'
        assert result['name'] == 'Cystic fibrosis'

    # both gene search and refseq search should yield the same, non-empty results
    results = []
    for query in ['cystic in BR', 'cystic in NM_007', 'cystic in 007']:
        response = autocomplete(query)
        result = get_entry_and_check_type(response, 'disease_in_protein')
        results.append(result)

    assert all(r == result for r in results) and result
def test_counting(self):
    """Count mutations and sites with respect to motifs of a made-up site type.

    The 'xation' site type comes with a canonical and a non-canonical
    motif; the protein has three sites which are expected to match the
    canonical motif and one which is expected to match the non-canonical
    one, and all four mutations are located close to the canonical sites.
    """
    motifs_db = {
        # xation happens whenever there is X which is not preceded with or followed by another X
        'xation': {
            'canonical': '.{6}[^X]X[^X].{6}',
            'non-canonical': 'XXY'
        }
    }
    x_motifs = motifs_db['xation']

    p = Protein(refseq='NM_007', id=1, sequence='_X_X_______X________XXY')

    # (position, alt) of mutations; references are kept until the test ends
    mutation_specs = (
        (1, 'X'),    # proximal, breaking
        (1, 'o'),    # proximal, non-breaking
        (2, 'Y'),    # direct, breaking
        (3, 'X'),    # proximal for two sites, breaking
    )
    mutations = [
        Mutation(protein=p, position=position, alt=alt)
        for position, alt in mutation_specs
    ]

    xation = SiteType(name='xation')

    # 2: canonical, seriously mutated and broken
    # 4: canonical, mutated
    # 12: canonical, not mutated
    canonical_sites = [
        Site(protein=p, position=position, types={xation})
        for position in (2, 4, 12)
    ]
    # 22: non-canonical motif, not mutated
    other_sites = [Site(protein=p, position=22, types={xation})]
    all_sites = canonical_sites + other_sites

    db.session.add(p)
    db.session.commit()

    counter = MotifsCounter(xation, motifs_db=motifs_db)

    # the counts
    counts = counter.count_muts_and_sites(Mutation.query, Site.query)

    assert counts.muts_around_sites_with_motif['canonical'] == 4
    assert counts.muts_breaking_sites_motif['canonical'] == 3
    assert counts.sites_with_broken_motif['canonical'] == 2
    assert counts.sites_with_motif['canonical'] == len(canonical_sites)

    assert counts.sites_with_broken_motif['non-canonical'] == 0
    assert counts.muts_around_sites_with_motif['non-canonical'] == 0

    # the selection should not depend on the kind of the sites container
    selection = select_sites_with_motifs(Site.query, x_motifs)

    assert selection['canonical'] == set(canonical_sites)
    assert select_sites_with_motifs(all_sites, x_motifs) == selection

    # the actual objects behind the counts
    data = counter.gather_muts_and_sites(Mutation.query, Site.query)

    broken_sites = set(canonical_sites[:2])
    assert data.sites_with_broken_motif['canonical'] == broken_sites
    assert data.sites_with_motif['canonical'] == set(canonical_sites)
def test_mutation(self):
    """Query '/chromosome/mutation/...' for a novel and for an annotated mutation.

    Two protein mutations (positions 13 and 15) are mapped from chromosome
    20; the first one has no dataset annotations, the second one gets MC3
    and ESP6500 details, which are then requested with and without filters.
    """
    site = Site(position=13, types={SiteType(name='methylation')})
    gene = Gene(name='SomeGene')
    protein = Protein(refseq='NM_007', id=1, sites=[site], sequence='A' * 15, gene=gene)

    db.session.add(protein)

    from database import bdb

    # position in the protein -> position in the chromosome
    genomic_positions = {13: 14370, 15: 14376}
    muts = {}

    for aa_pos, dna_pos in genomic_positions.items():
        mapped_mutation = Mutation(protein=protein, position=aa_pos, alt='V')
        muts[aa_pos] = mapped_mutation
        bdb.add_genomic_mut('20', dna_pos, 'G', 'A', mapped_mutation, is_ptm=True)

    query_url = '/chromosome/mutation/{chrom}/{pos}/{ref}/{alt}'

    # query as a novel mutation
    novel_mutation_query = query_url.format(chrom='chr20', pos=14370, ref='G', alt='A')
    response = self.client.get(novel_mutation_query)

    assert response.status_code == 200

    expected_site = {
        'kinases': [],
        'position': 13,
        'residue': 'A',
        'kinase_groups': [],
        'type': 'methylation'
    }
    expected_mutation = {
        'alt': 'V',
        'gene': 'SomeGene',
        'in_datasets': {},
        'pos': 13,
        'ptm_impact': 'direct',
        'cnt_ptm': 1,
        'closest_sites': ['13 A'],
        'protein': 'NM_007',
        'sites': [expected_site],
        'ref': 'A'
    }
    assert response.json == [expected_mutation]

    # well let's look on a known mutation:
    known = muts[15]

    breast_cancer = Cancer(name='Breast invasive carcinoma', code='BRCA')
    mc3 = MC3Mutation(mutation=known, cancer=breast_cancer, count=1)
    esp = ExomeSequencingMutation(mutation=known, maf_all=0.02, maf_aa=0.02)

    db.session.add_all([known, mc3, esp])
    db.session.commit()

    mutation_a15v_query = query_url.format(chrom='chr20', pos=14376, ref='G', alt='A')
    response = self.client.get(mutation_a15v_query)

    metadata = {
        'MC3': {
            'Cancers': [{
                'Cancer': 'Breast invasive carcinoma',
                'Value': 1
            }]
        },
        'ESP6500': {
            'MAF': 0.02,
            'MAF AA': 0.02,
            'MAF EA': None
        }
    }

    assert response.json[0]['in_datasets'] == metadata

    expected_values = {'MC3': 1, 'ESP6500': 0.02}

    # if user does not want to download data for all datasets he may use:
    for source, meta in metadata.items():
        source_only_query = mutation_a15v_query + '?filters=Mutation.sources:in:' + source
        response = self.client.get(source_only_query)
        first_result = response.json[0]
        assert first_result['in_datasets'] == {source: meta}
        assert first_result['value'] == expected_values[source]

    # (query string with filters, are any results expected?)
    # no frequency was given for the European American population
    narrowed_queries = (
        ('?filters=Mutation.sources:in:MC3;Mutation.mc3_cancer_code:in:BRCA', True),
        ('?filters=Mutation.sources:in:ESP6500;Mutation.populations_ESP6500:in:African American', True),
        ('?filters=Mutation.sources:in:ESP6500;Mutation.populations_ESP6500:in:European American', False),
    )

    for filters, results_expected in narrowed_queries:
        response = self.client.get(mutation_a15v_query + filters)
        if results_expected:
            assert response.json
        else:
            assert not response.json
def test_default_ref(self):
    """A mutation created without `ref` should report the residue from the protein sequence."""
    protein = Protein(sequence='ABC')
    mutation = Mutation(position=1, protein=protein)

    session = db.session
    session.add(protein)
    session.commit()

    # 'A' is the first residue of the sequence 'ABC'
    reference_residue = mutation.ref
    assert reference_residue == 'A'