def test_divide_muts_by_sites(self):
    """Exercise `divide_muts_by_sites` with empty, single-site and multi-site input.

    As the expected values show, one mutation may be listed under
    more than one site (e.g. the mutation at position 8).
    """
    from views.network import divide_muts_by_sites

    # the degenerate call (no mutations, no sites) must simply not fail
    divide_muts_by_sites([], [])

    # a single site without any mutations maps to an empty list
    s_1 = Site(position=1)
    grouped = divide_muts_by_sites([], [s_1])
    assert grouped[s_1] == []

    # full scenario: three sites with mutations scattered around them
    s_2 = Site(position=10)
    s_3 = Site(position=20)

    mutations_at = {}
    for position in (1, 2, 8, 14, 16, 30):
        mutations_at[position] = [Mutation(position=position)]
    # position 16 carries two mutations
    mutations_at[16].append(Mutation(position=16))

    def mutations_from(*positions):
        """Concatenate mutations of given positions, keeping the positions order."""
        collected = []
        for position in positions:
            collected.extend(mutations_at[position])
        return collected

    every_mutation = []
    for same_position_mutations in mutations_at.values():
        every_mutation.extend(same_position_mutations)

    grouped = divide_muts_by_sites(every_mutation, [s_1, s_2, s_3])

    assert grouped[s_1] == mutations_from(1, 2, 8)
    assert grouped[s_2] == mutations_from(8, 14, 16)
    assert grouped[s_3] == mutations_from(14, 16)
def get_filter_by_sources(sources):
    """Build a conjunction of "has details in this source" conditions.

    Args:
        sources: iterable of mutation sources; each one is passed to
            `are_details_managed` and `Mutation.get_relationship`.

    Returns:
        An `and_` clause joining one condition per source: `.any()` on the
        relationship when the details of the source are managed,
        `.has()` otherwise.
    """
    def is_present_in(source):
        # managed details use `.any()`, the remaining ones use `.has()`
        managed = are_details_managed(source)
        relationship = Mutation.get_relationship(source)
        return relationship.any() if managed else relationship.has()

    # unpack explicitly: `and_` takes clauses as separate positional arguments;
    # handing it a single generator depends on implicit coercion
    return and_(*[is_present_in(source) for source in sources])
def create_network():
    """Populate the database with a small protein-kinase-drug network fixture.

    Creates a test protein with two phosphorylation sites, a known and
    a predicted kinase, a drug targeting the known kinase and mutations
    (with MC3 and MIMP details), commits everything and then reloads
    the cached filter queries.
    """
    protein = create_test_protein()
    ovarian = Cancer(name='Ovarian', code='OV')

    known_kinase = create_test_kinase('Kinase Y', 'NM_0009')

    mc3_details = [MC3Mutation(cancer=ovarian)]
    kinase_mutation = Mutation(position=1, alt='T', meta_MC3=mc3_details)
    known_kinase.protein.mutations = [kinase_mutation]

    drug = Drug(
        name='Drug targeting ' + known_kinase.name,
        drug_bank_id='DB01',
        target_genes=[known_kinase.protein.gene],
        # by default only approved drugs are shown
        groups={DrugGroup(name='approved')}
    )

    kinase_group = KinaseGroup(name='Group of kinases')

    site_with_kinase = Site(
        position=1, type='phosphorylation', residue='T',
        kinases=[known_kinase], kinase_groups=[kinase_group]
    )
    site_with_group_only = Site(
        position=2, type='phosphorylation', residue='R',
        kinase_groups=[kinase_group]
    )
    protein.sites = [site_with_kinase, site_with_group_only]

    predicted_kinase = create_test_kinase('Kinase Z', 'NM_0002')

    # both MIMP predictions refer to the same site and share the numeric details
    loss_prediction = MIMPMutation(
        pwm=known_kinase.name, effect='loss', site=site_with_kinase,
        probability=0.1, position_in_motif=1
    )
    gain_prediction = MIMPMutation(
        pwm=predicted_kinase.name, effect='gain', site=site_with_kinase,
        probability=0.1, position_in_motif=1
    )
    protein_mutation = Mutation(
        position=2, alt='T',
        meta_MC3=[MC3Mutation(cancer=ovarian)],
        meta_MIMP=[loss_prediction, gain_prediction]
    )
    protein.mutations = [protein_mutation]

    session = db.session
    session.add_all([protein, drug, predicted_kinase])
    session.commit()

    # a new cancer was added, reload is necessary
    # (this should not happen during normal app usage)
    from website.views.filters import cached_queries
    cached_queries.reload()
def test_search_mutations(self):
    """Search mutations by a text query, a repeated query and a VCF upload."""
    methylation_site = Site(position=13, types={SiteType(name='methylation')})
    protein = Protein(
        refseq='NM_007', id=7, sites=[methylation_site],
        sequence='XXXXXXXXXXXXV'
    )

    m_in_site = Mutation(protein=protein, position=13, alt='V')
    m_out_site = Mutation(protein=protein, position=50, alt='K')

    db.session.add(protein)

    # points to the same location as first record in VCF_FILE_CONTENT
    test_query = 'chr20 14370 G A'

    from database import bdb

    # map the first genomic mutation from VCF_FILE_CONTENT
    # to some (mocked) protein mutation
    bdb.add_genomic_mut('20', 14370, 'G', 'A', m_in_site, is_ptm=True)

    def alt_cell(mutation):
        """Table cell (as bytes) expected to hold the alt residue of the mutation."""
        return '<td>{0}</td>'.format(mutation.alt).encode()

    # --- basic test: is the appropriate mutation in results? ---
    response = self.search_mutations(mutations=test_query)
    assert response.status_code == 200

    page = response.data
    # this mutation is exactly at a PTM site and should be included in results
    assert alt_cell(m_in_site) in page
    # this mutation lies outside of a PTM site - by default should be filtered out
    assert alt_cell(m_out_site) not in page

    # --- count test: is mutation for this query annotated as shown twice? ---
    doubled_query = '{0}\n{0}'.format(test_query)
    response = self.search_mutations(mutations=doubled_query)
    assert response.status_code == 200
    assert b'<td>2</td>' in response.data

    # --- VCF file test ---
    upload = {
        'vcf-file': (BytesIO(VCF_FILE_CONTENT), 'exemplar_vcf.vcf')
    }
    response = self.client.post(
        '/search/mutations',
        content_type='multipart/form-data',
        data=upload
    )
    assert response.status_code == 200
    assert b'NM_007' in response.data
def test_prepare_dataset(self):
    """`prepare_datasets` should describe every confirmed source.

    Only the MC3 entry is expected to report the mutation as present.
    """
    from views.mutation import prepare_datasets

    gene = Gene(name='TP53')
    protein = Protein(refseq='NM_000123', sequence='TRAN', gene=gene)
    mutation = Mutation(protein=protein, position=2, alt='K')
    details = MC3Mutation(mutation=mutation, count=2)

    db.session.add(mutation)

    datasets, user_datasets = prepare_datasets(mutation)

    expected_datasets = []
    for source in source_manager.confirmed:
        # the mutation has details only in MC3
        presence = [details] if source is MC3Mutation else False
        expected_datasets.append({
            'filter': 'Mutation.sources:in:' + source.name,
            'name': source.display_name,
            'mutation_present': presence
        })

    assert datasets == expected_datasets
    assert not user_datasets
def test_sites(self):
    """Check counts of closest sites and of affected PTM sites per mutation."""
    mutations = [Mutation(position=position) for position in (0, 5, 12, 57)]
    sites = [Site(position=position) for position in (10, 14, 15, 57)]

    protein = Protein(refseq='NM_00002', mutations=mutations, sites=sites)

    db.session.add(protein)
    db.session.commit()

    def assert_sites_counts(find_sites, expected_counts):
        """Compare the number of sites found for each mutation with expectations."""
        expected_by_mutation = {
            mutation: count
            for mutation, count in zip(mutations, expected_counts)
        }
        for mutation, expected_sites_cnt in expected_by_mutation.items():
            assert len(find_sites(mutation)) == expected_sites_cnt

    # ==test_find_closest_sites==
    # for mutation at position 0 there is no closest site;
    # for mutation at position 5 there should be 1 closest site
    assert_sites_counts(
        lambda mutation: mutation.find_closest_sites(),
        [0, 1, 2, 1]
    )

    # ==test_get_affected_ptm_sites==
    assert_sites_counts(
        lambda mutation: mutation.get_affected_ptm_sites(),
        [0, 1, 3, 1]
    )
def count_by_sources(sources: List[MutationSource], site_type: SiteType, primary_isoforms=True, by_genes=True, genes=None, muts_conjunction=or_, **kwargs):
    """Count mutations and sites with `MotifsCounter`, overall or gene by gene.

    Args:
        sources: mutation sources; one `Mutation.in_sources` condition is
            created per source
        site_type: site type used to select sites and to create the counter
        primary_isoforms: join proteins and keep preferred isoforms only
        by_genes: if false, a single result for all mutations and sites
            is returned; otherwise the counting is repeated for each gene
        genes: genes to iterate over; all genes are queried when empty
        muts_conjunction: callable combining the per-source conditions
        **kwargs: passed on to `MotifsCounter.count_muts_and_sites`

    Returns:
        The result of `count_muts_and_sites` when `by_genes` is false,
        otherwise a dict of such results keyed by gene name.
    """
    source_conditions = [Mutation.in_sources(source) for source in sources]
    base_query = Mutation.query.filter(muts_conjunction(*source_conditions))

    if primary_isoforms:
        base_query = base_query.join(Protein)
        base_query = base_query.filter(Protein.is_preferred_isoform)

    sites = Site.query.filter(Site.types.contains(site_type))
    counter = MotifsCounter(site_type)

    if by_genes:
        if not genes:
            genes = Gene.query.all()

        counts_by_genes = {}
        for gene in tqdm(genes):
            # restrict both mutations and sites to the preferred isoform of the gene
            gene_mutations = base_query.filter(
                Mutation.protein == gene.preferred_isoform
            )
            gene_sites = sites.filter(Site.protein == gene.preferred_isoform)
            counts_by_genes[gene.name] = counter.count_muts_and_sites(
                gene_mutations, gene_sites, show_progress=False, **kwargs
            )
        return counts_by_genes

    return counter.count_muts_and_sites(base_query, sites, **kwargs)
def create_test_models():
    """Create a small set of interlinked test models.

    Returns:
        dict with the created `protein`, `mutation`, `protein_kinase`,
        `kinase` and `site` objects, keyed by those names.
    """
    protein = Protein(
        refseq='NM_0001',
        gene=Gene(name='SOMEGENE'),
        sequence='ABCD'
    )
    mutation = Mutation(protein=protein, position=1, alt='E')
    protein.gene.preferred_isoform = protein

    # the details attach themselves to the mutation, no need to keep references
    MC3Mutation(
        mutation=mutation,
        cancer=Cancer(code='CAN'),
        samples='Sample A,Sample B',
        count=2
    )
    InheritedMutation(
        mutation=mutation,
        clin_data=[
            ClinicalData(disease=Disease(name='Some disease'), sig_code=5),
            ClinicalData(disease=Disease(name='Other disease'), sig_code=2)
        ]
    )

    protein_kinase = Protein(
        refseq='NM_0002',
        gene=Gene(name='OTHERGENE'),
        sequence='ABCD'
    )
    kinase = Kinase(name='Kinase name', protein=protein_kinase)
    site = Site(
        protein=protein, position=1, residue='A',
        kinases={kinase}, pmid={1, 2},
        types={SiteType(name='glycosylation')}
    )
    protein.sites = [site]

    # explicit equivalent of returning all the local variables
    return {
        'protein': protein,
        'mutation': mutation,
        'protein_kinase': protein_kinase,
        'kinase': kinase,
        'site': site
    }
def mutation_sources():
    """Map names of mutation sources to their models, skipping the 'user' source.

    Returns:
        dict: source name -> result of `Mutation.get_source_model(name)`,
        in the iteration order of `Mutation.sources_dict`.
    """
    return {
        name: Mutation.get_source_model(name)
        for name in Mutation.sources_dict
        if name != 'user'
    }
def source_specific_mutated_sites():
    """Collect per-source statistics of PTM mutations and mutated sites.

    For every source returned by `mutation_sources()` three things are
    counted: confirmed mutations flagged `is_ptm_distal`, confirmed
    mutations present in both MIMP and the source, and mutated sites for
    each site type. Mutated sites are additionally counted without
    specifying any source and stored under the 'merged' key.

    Returns:
        dict with three statistics, keyed by their display names.
    """
    ptm_mutations_counts = {}
    mimp_mutations_counts = {}
    mutated_sites = defaultdict(dict)

    # empty will match all sites
    site_type_queries = [models.SiteType(name=''), *models.SiteType.query]

    for name, model in mutation_sources().items():
        in_ptm_sites = (
            Mutation.query
            .filter_by(is_confirmed=True, is_ptm_distal=True)
            .filter(Mutation.in_sources(model))
        )
        ptm_mutations_counts[name] = in_ptm_sites.count()

        with_mimp = Mutation.query.filter(
            and_(
                Mutation.in_sources(models.MIMPMutation, model),
                Mutation.is_confirmed,
            )
        )
        mimp_mutations_counts[name] = with_mimp.count()

        sites_of_source = mutated_sites[name]
        for site_type in tqdm(site_type_queries):
            sites_of_source[site_type] = count_mutated_sites([site_type], model)

    mutated_sites['merged'] = {
        site_type: count_mutated_sites([site_type])
        for site_type in tqdm(site_type_queries)
    }

    return {
        'Mutations - in PTM sites': ptm_mutations_counts,
        'Mutations - with network-rewiring effect': mimp_mutations_counts,
        'PTM sites affected by mutations': mutated_sites
    }
def with_significant_genes(self, significant_gene_list_name):
    """Render the page for the gene list of the given name.

    The `first_or_404()` lookup handles the case of no gene list with
    such a name. The optional `query` request argument is passed on
    to the template.
    """
    query = request.args.get('query', '')

    matching_lists = GeneList.query.filter_by(name=significant_gene_list_name)
    gene_list = matching_lists.first_or_404()

    context = {
        'gene_list': gene_list,
        'dataset': Mutation.get_source_model(gene_list.mutation_source_name),
        'endpoint': 'significant_data',
        'endpoint_kwargs': {'gene_list_id': gene_list.id},
        'query': query
    }
    return template('pathway/significant.html', **context)
def get_genes_with_mutations_from_sources(sources, only_genes_with_ptm_sites=False):
    """Select genes whose preferred isoform has mutations in given sources.

    Args:
        sources: mutation sources, passed together to `Mutation.in_sources`
        only_genes_with_ptm_sites: keep only the genes whose preferred
            isoform has a non-empty `sites` collection

    Returns:
        set of genes
    """
    genes_query = db.session.query(Gene)
    # genes are linked to mutations through their preferred isoforms only
    genes_query = genes_query.join(
        Protein, Gene.preferred_isoform_id == Protein.id
    )
    genes_query = genes_query.join(Mutation)
    genes_query = genes_query.filter(Mutation.in_sources(*sources))

    found_genes = set(genes_query.distinct())

    if not only_genes_with_ptm_sites:
        return found_genes

    return {
        gene
        for gene in found_genes
        if gene.preferred_isoform.sites
    }
def mutated_ptm_sites_in_proximity(mutation_source, type_1: str, type_2: str, mutation_filter=True, distance: int = 7, only_preferred=True):
    """Restrict PTM sites in proximity to those affected by mutations.

    Starts from the query and the pair of site entities returned by
    `ptm_sites_in_proximity` and requires each of the two sites to be
    affected by at least one mutation which is in `mutation_source`
    and satisfies `mutation_filter`.

    Args:
        mutation_source: source passed to `Mutation.in_sources`
        type_1: type of the first site, passed to `ptm_sites_in_proximity`
        type_2: type of the second site, passed to `ptm_sites_in_proximity`
        mutation_filter: additional condition joined (with `and_`) with
            the source condition; the default `True` adds no restriction
        distance: passed to `ptm_sites_in_proximity`
        only_preferred: passed to `ptm_sites_in_proximity`

    Returns:
        The filtered sites query (NOT a number, the query is not
        executed here; callers needing a count have to count the results).
    """
    sites, (site_1, site_2) = ptm_sites_in_proximity(
        type_1, type_2, distance, only_preferred
    )

    # both sites of the pair have to be affected by a relevant mutation
    for site in (site_1, site_2):
        sites = sites.filter(
            site.affected_by_mutations.any(
                and_(
                    Mutation.in_sources(mutation_source),
                    mutation_filter
                )
            )
        )

    return sites
def most_mutated_sites(sources: List[MutationSource], site_type: SiteType = None, limit=25, intersection=True, exclusive=None, mutation_filter=None):
    """Sources must have the same value_type (counts/frequencies)

    Builds a query of sites of preferred isoforms ordered (descending)
    by the sum of per-source mutation counts.

    Args:
        sources: mutation sources; each is joined to the mutations query
            and contributes one count expression (`prepare_for_summing`)
        site_type: if given, sites are filtered with `SiteType.fuzzy_filter`
        limit: maximal number of rows, applied to the final query
        intersection: inner-join the sources if true, outer-join otherwise
        exclusive: sources the mutations must NOT be in; cannot be
            combined with `intersection` (asserted)
        mutation_filter: optional additional filter

    Returns:
        Query yielding pairs of (Site, mutations_count).
    """
    # the two modes are mutually exclusive
    assert not (intersection and exclusive)

    counts = prepare_for_summing(sources)

    # one labelled count column (count_0, count_1, ...) per source
    query = (db.session.query(
        Site, *[count.label(f'count_{i}') for i, count in enumerate(counts)]).select_from(Mutation))

    if intersection:
        for source in sources:
            query = query.join(source)
    else:
        for source in sources:
            query = query.outerjoin(source)

    # can be truthy only if `intersection` is false (see the assertion above)
    if exclusive:
        query = query.filter(~Mutation.in_sources(*exclusive))

    if mutation_filter is not None:
        query = query.filter(mutation_filter)

    query = (query.join(Mutation.affected_sites).filter(
        Site.protein.has(Protein.is_preferred_isoform)))

    if site_type:
        query = query.filter(SiteType.fuzzy_filter(site_type, join=True))

    # NOTE(review): the HAVING clause presumably discards sites for which
    # any of the per-source counts is zero - confirm
    query = (query.group_by(Site).having(and_(*counts)))

    query = query.subquery()

    # sum the per-source count columns of the subquery
    total_muts_count = reduce(
        operator.add, [getattr(query.c, f'count_{i}') for i in range(len(counts))])

    total_muts_count = total_muts_count.label('mutations_count')

    query = (db.session.query(
        aliased(Site, query),
        total_muts_count,
    ).order_by(desc(total_muts_count)))

    return query.limit(limit)
def test_mutate(self):
    """`mutate_sequence` should substitute the residue at the mutated position."""
    protein = Protein(sequence='ABCDE')
    site = Site(protein=protein, position=3, residue='C')

    # (position of the mutation, expected sequence)
    cases = [
        (1, 'XBCDE'),
        (3, 'ABXDE'),
    ]

    for position, expected_seq in cases:
        mutation = Mutation(protein=protein, position=position, alt='X')
        mutated = mutate_sequence(site, mutation, offset=2)
        assert mutated == expected_seq
def test_show(self):
    """The mutation page should load and mention both the gene and the refseq."""
    protein = Protein(
        refseq='NM_000123',
        sequence='TRAN',
        gene=Gene(name='TP53')
    )
    mutation = Mutation(protein=protein, position=2, alt='K')
    db.session.add(mutation)

    response = self.client.get('/mutation/show/NM_000123/2/K')

    assert response.status_code == 200
    page = response.data
    for expected_fragment in (b'TP53', b'NM_000123'):
        assert expected_fragment in page
def test_impact_on_ptm(self):
    """A mutation at the very position of a site should have 'direct' impact."""
    mutation = Mutation(position=61)
    protein = Protein(refseq='NM_00001', mutations=[mutation])
    db.session.add(protein)

    # one site exactly at the mutated position and two more sites upstream
    protein.sites = [
        Site(position=position)
        for position in (61, 54, 51)
    ]

    impact = mutation.impact_on_ptm()
    assert impact == 'direct'
def get_confirmed_mutations(sources, only_preferred=True, genes=None, confirmed_by_definition=False, base_query=None):
    """
    Utility to generate a query for retrieving confirmed mutations
    having specific mutation details.

    Args:
        sources: list of mutation details (sources) to be used to filter
            the mutations (including sources with non-confirmed mutations)
        only_preferred: include only mutations from preferred isoforms;
            NOTE: the preferred isoforms restriction is currently applied
            unconditionally (see the TODO below), thus passing False
            does not widen the results
        genes: limit to genes from provided list
        confirmed_by_definition: do not apply the expensive is_confirmed=True
            filter as all sources include only confirmed mutations
        base_query: the initial mutation query (allows to adjust selected columns);
            `Mutation.query` is used when None

    Returns:
        Query object yielding mutations.
    """
    # compare with None explicitly instead of evaluating truthiness of a query
    if base_query is None:
        base_query = Mutation.query

    mutations = base_query

    def only_from_primary_isoforms(mutations_query):
        """Join proteins (only once) and keep the preferred isoforms."""
        mutations_query = join_unique(mutations_query, Protein)
        return mutations_query.filter(Protein.is_preferred_isoform)

    if not confirmed_by_definition:
        mutations = mutations.filter_by(is_confirmed=True)

    # TODO: remove?
    mutations = only_from_primary_isoforms(mutations)

    if genes:
        mutations = mutations.filter(
            Protein.id.in_([g.preferred_isoform_id for g in genes]))

    selected_mutations = mutations.filter(Mutation.in_sources(*sources))

    if only_preferred:
        selected_mutations = only_from_primary_isoforms(selected_mutations)

    return selected_mutations
def count_mutated_sites(site_types: Iterable[models.SiteType]=tuple(), model=None, only_primary=False, disordered=None, custom_filter=None):
    """Count distinct sites having a mutation within 7 positions of the site.

    Args:
        site_types: site types; a `SiteType.fuzzy_filter` is added for each
        model: mutation source to restrict the mutations to; confirmed
            mutations are used when it is not given
        only_primary: join proteins and keep preferred isoforms only
        disordered: if not None, require `Site.in_disordered_region`
            to be equal to this value
        custom_filter: optional additional filter

    Returns:
        The scalar result of the counting query.
    """
    # the site id is counted only if the mutation falls into the site window
    in_site_window = Mutation.position.between(
        Site.position - 7,
        Site.position + 7
    )
    mutated_site_id = case(
        [(in_site_window, Site.id)],
        else_=literal_column('NULL')
    )

    filters = [
        Mutation.protein_id == Protein.id,
        Site.protein_id == Protein.id,
        Mutation.precomputed_is_ptm
    ]
    filters.extend(
        models.SiteType.fuzzy_filter(site_type)
        for site_type in site_types
    )
    if custom_filter is not None:
        filters.append(custom_filter)
    if disordered is not None:
        filters.append(Site.in_disordered_region == disordered)

    query = db.session.query(func.count(distinct(mutated_site_id)))
    query = query.filter(and_(*filters))
    query = query.join(Mutation, Site.protein_id == Mutation.protein_id)

    if model:
        source_condition = Mutation.in_sources(model)
    else:
        # SQL expression, has to be compared with `==`
        source_condition = Mutation.is_confirmed == True  # noqa: E712
    query = query.filter(source_condition)

    if only_primary:
        query = query.join(Protein).filter(Protein.is_preferred_isoform)

    return query.scalar()
def mutation_by_source(combination, site_type=None, only_within_ptm_sites=False, only_primary=False):
    """Count mutations which are in the given combination of sources.

    Args:
        combination: sources, passed together to `Mutation.in_sources`
        site_type: if given, require an affected site of this type
        only_within_ptm_sites: require `Mutation.precomputed_is_ptm`
        only_primary: join proteins and keep preferred isoforms only

    Returns:
        Number of matching mutations.
    """
    conditions = [Mutation.in_sources(*combination)]

    if only_within_ptm_sites:
        # the precomputed flag is used here rather than
        # `Mutation.is_ptm_distal == True`
        conditions.append(Mutation.precomputed_is_ptm)

    if site_type:
        conditions.append(
            Mutation.affected_sites.any(Site.types.contains(site_type))
        )

    mutations = Mutation.query.filter(*conditions)

    if only_primary:
        mutations = mutations.join(Protein)
        mutations = mutations.filter(Protein.is_preferred_isoform)

    return mutations.count()
def count_mutations_from_genes(genes, sources, only_preferred_isoforms=False, strict=True):
    """Counts mutations and PTM mutations from isoforms from given set of genes.

    The totals (and the PTM/all ratio) are also printed.

    Args:
        genes: a list of Gene
        sources: a list of MutationDetails - only confirmed mutations from
            sources identified by given MutationDetail classes will be counted
        only_preferred_isoforms: should only one isoform per gene
            (the preferred/primary one) be used when filtering mutations?
        strict: if true, distinct (position, alt, protein id) combinations
            are counted; otherwise distinct rows of the plain mutations query

    Returns:
        tuple: (count of all mutations, count of PTM mutations)
    """
    all_mutations_count = 0
    ptm_mutations_count = 0

    if strict:
        base_query = (
            db.session.query(Mutation.position, Mutation.alt, Protein.id)
            .select_from(Mutation)
            .join(Protein)
        )
    else:
        base_query = Mutation.query

    for gene in tqdm(genes):

        if only_preferred_isoforms:
            proteins = [gene.preferred_isoform]
        else:
            proteins = gene.isoforms

        mutations_filters = and_(
            Mutation.protein_id.in_([p.id for p in proteins]),
            Mutation.is_confirmed == True,  # noqa: E712 (SQL expression)
            Mutation.in_sources(*sources)
        )

        all_mutations_count += (
            base_query.filter(mutations_filters).distinct().count()
        )
        ptm_mutations_count += (
            base_query
            .filter(and_(Mutation.precomputed_is_ptm, mutations_filters))
            .distinct()
            .count()
        )

    # the ratio is undefined when nothing was counted (e.g. no genes given);
    # report NaN instead of failing with ZeroDivisionError
    if all_mutations_count:
        ptm_ratio = ptm_mutations_count / all_mutations_count
    else:
        ptm_ratio = float('nan')

    print(all_mutations_count, ptm_mutations_count, ptm_ratio)

    return all_mutations_count, ptm_mutations_count
def test_browse_list(self):
    """Browse a gene list: the static page and the dynamic (JSON) data."""
    from miscellaneous import make_named_temp_file
    from test_imports.test_gene_list import raw_gene_list
    from imports.protein_data import active_driver_gene_lists as load_active_driver_gene_lists

    list_path = make_named_temp_file(raw_gene_list)

    # create gene list and genes
    with self.app.app_context():
        from imports.protein_data import ListData
        tcga_list = ListData(
            name='TCGA', path=list_path, mutations_source=TCGAMutation
        )
        gene_lists = load_active_driver_gene_lists(lists=(tcga_list,))
    db.session.add_all(gene_lists)

    # create preferred isoforms for genes
    for index, gene in enumerate(Gene.query.all()):
        # at least one mutation is required for gene on a gene list to be displayed
        mutation = Mutation()
        MC3Mutation(mutation=mutation)

        isoform = Protein(refseq='NM_000%s' % index, mutations=[mutation])
        gene.isoforms = [isoform]
        gene.preferred_isoform = isoform

    # check the static template
    static_page = self.client.get('/gene/list/TCGA')
    assert static_page.status_code == 200
    assert b'TCGA' in static_page.data

    # check the dynamic data
    data_response = self.client.get('/gene/list_data/TCGA?order=asc')
    assert data_response.status_code == 200

    gene_list = GeneList.query.filter_by(name='TCGA').one()

    # all results retrieved
    assert data_response.json['total'] == len(gene_list.entries)

    # properly sorted by fdr
    fdrs = []
    for row in data_response.json['rows']:
        fdrs.append(row['fdr'])
    assert fdrs == sorted(fdrs)
def create_test_models():
    """Create a minimal set of interlinked test models.

    Returns:
        dict with the created `protein`, `mutation`, `protein_kinase`,
        `kinase` and `site` objects, keyed by those names.
    """
    protein = Protein(
        refseq='NM_0001',
        gene=Gene(name='SOMEGENE'),
        sequence='ABCD'
    )
    mutation = Mutation(protein=protein, position=1, alt='E')

    # the details attach themselves to the mutation, no need to keep references
    MC3Mutation(
        mutation=mutation,
        cancer=Cancer(code='CAN'),
        samples='Some sample'
    )
    InheritedMutation(
        mutation=mutation,
        clin_data=[
            ClinicalData(disease=Disease(name='Some disease'))
        ]
    )

    protein_kinase = Protein(
        refseq='NM_0002',
        gene=Gene(name='OTHERGENE'),
        sequence='ABCD'
    )
    kinase = Kinase(name='Kinase name', protein=protein_kinase)
    site = Site(protein=protein, position=1, residue='A', kinases=[kinase])
    protein.sites = [site]

    # explicit equivalent of returning all the local variables
    return {
        'protein': protein,
        'mutation': mutation,
        'protein_kinase': protein_kinase,
        'kinase': kinase,
        'site': site
    }
def __init__(self, protein, filter_manager, include_kinases_from_groups=False):
    """Prepare `json_data` describing mutations, sites and tracks of the protein.

    The value type (and the choice of the log scale, used for
    frequencies) comes from the model of the mutation source currently
    selected in the filter manager.
    """
    super().__init__(protein, filter_manager, include_kinases_from_groups)

    # only the sites are needed here, kinases and groups are ignored
    sites, _kinases, _kinase_groups = self.get_sites_and_kinases(
        only_sites_with_kinases=False
    )

    tracks = prepare_tracks(protein, self.protein_mutations)

    selected_source = filter_manager.get_value('Mutation.sources')
    value_type = Mutation.get_source_model(selected_source).value_type

    needles = self.represent_needles()

    json_data = {}
    json_data['value_type'] = value_type
    json_data['log_scale'] = (value_type == 'frequency')
    json_data['mutations'] = needles
    json_data['sites'] = prepare_sites(sites)
    json_data['tracks'] = tracks

    self.json_data = json_data
def test_impact_on_specific_ptm(self):
    """Check the impact category of mutations at various distances from a site."""
    # (position of a mutation, impact expected once a site at position 1 exists)
    expected_impacts = [
        (10, 'none'),    # too far away
        (9, 'none'),
        (8, 'distal'),
        (4, 'distal'),
        (3, 'proximal'),
        (2, 'proximal'),
        (1, 'direct')
    ]
    mutations = {
        Mutation(position=position): impact
        for position, impact in expected_impacts
    }

    protein = Protein(refseq='NM_00001', mutations=mutations.keys())
    db.session.add(protein)

    # case 0: there are no sites in the protein
    for mutation in mutations:
        assert mutation.impact_on_ptm() == 'none'

    # case 1: there are some sites in the protein
    protein.sites = [Site(position=1), Site(position=50)]
    first_site = protein.sites[0]

    for mutation, impact in mutations.items():
        print(mutation)
        assert mutation.impact_on_ptm() == impact
        assert mutation.impact_on_specific_ptm(first_site) == impact

    # case 2: there are some sites but all will be excluded by a site filter
    def reject_all_sites(sites):
        return []

    for mutation in mutations:
        assert mutation.impact_on_ptm(site_filter=reject_all_sites) == 'none'
def count_by_sources(sources: List[MutationSource]):
    """Count mutations matching `Mutation.in_sources` for all given sources."""
    mutations = Mutation.query
    in_given_sources = Mutation.in_sources(*sources)
    return mutations.filter(in_given_sources).count()
def confirmed_with_mimp(self):
    """Count confirmed mutations which have MIMP details."""
    mutations = Mutation.query
    confirmed_and_in_mimp = and_(
        Mutation.in_sources(models.MIMPMutation),
        Mutation.is_confirmed,
    )
    return mutations.filter(confirmed_and_in_mimp).count()
def test_autocomplete_all(self):
    """Test the universal autocomplete endpoint.

    Covers suggestions of: genes, amino acid and nucleotide mutations,
    pathways, diseases and mutations of a disease in a protein.
    """
    # MC3 GeneList is required as a target (a href for links) where users will be pointed
    # after clicking of cancer autocomplete suggestion. Likewise with the ClinVar list.
    db.session.add_all([
        GeneList(name=name, mutation_source_name=detail_class.name)
        for name, detail_class in [
            ('TCGA', MC3Mutation),
            ('ClinVar', InheritedMutation)
        ]
    ])

    g = Gene(name='BR')
    p = Protein(id=1, refseq='NM_007', gene=g, sequence='XXXXXV')
    # required for gene search to work - genes without preferred isoforms are ignored
    g.preferred_isoform = p
    mut = Mutation(protein=p, position=6, alt='E')
    db.session.add_all([mut, p, g])

    def autocomplete(query):
        """Query the endpoint and visit all the URLs it returned."""
        r = self.client.get('/search/autocomplete_all/?q=' + query)
        self.visit_returned_urls(r)
        return r

    from database import bdb_refseq, bdb
    bdb_refseq['BR V6E'] = [p.id]  # required for mutation search
    bdb.add_genomic_mut('1', 10000, 'T', 'C', mut)

    # Gene and mutations
    response = autocomplete('BR V6E')
    entry = get_entry_and_check_type(response, 'aminoacid mutation')
    assert entry

    response = autocomplete('BR V6')
    entry = get_entry_and_check_type(response, 'message')
    assert 'Awaiting for <code>{alt}</code>' in entry['name']

    response = autocomplete('BR V')
    entry = get_entry_and_check_type(response, 'message')
    assert 'Awaiting for <code>{pos}{alt}</code>' in entry['name']

    response = autocomplete('B')
    entry = get_entry_and_check_type(response, 'gene')
    assert 'BR' == entry['name']

    # genomic mutation
    response = autocomplete('chr1 10000 T C')
    entry = get_entry_and_check_type(response, 'nucleotide mutation')
    assert entry and entry['input'] == 'CHR1 10000 T C'

    # is the search falling back to the other strand?
    response = autocomplete('chr1 10000 A G')
    entry = get_entry_and_check_type(response, 'nucleotide mutation')
    assert entry and entry['input'] == 'complement of CHR1 10000 A G'

    prompt = 'Awaiting for mutation in <code>{chrom} {pos} {ref} {alt}</code> format'

    for prompt_invoking_query in ['chr1', 'chr1 ', 'chr1 40', 'chr1 40 ', 'chr1 40 T']:
        response = autocomplete(prompt_invoking_query)
        entry = get_entry_and_check_type(response, 'message')
        assert entry['name'] == prompt

    # Pathways
    pathways = [
        Pathway(description='Activation of RAS in B cells', reactome=1169092),
        Pathway(description='abortive mitotic cell cycle', gene_ontology=33277),
        Pathway(description='amacrine cell differentiation', gene_ontology=35881),
        Pathway(description='amniotic stem cell differentiation', gene_ontology=97086)
    ]
    db.session.add_all(pathways)

    # test partial matching and Reactome id pathways search
    for ras_activation_query in ['Activation', 'REAC:1', 'REAC:1169092']:
        response = autocomplete(ras_activation_query)
        entry = get_entry_and_check_type(response, 'pathway')
        assert entry['name'].startswith('Activation of RAS in B cells')

    # test Gene Ontology search:
    response = autocomplete('GO:33')
    go_pathway = get_entry_and_check_type(response, 'pathway')
    assert go_pathway['name'] == 'abortive mitotic cell cycle (GO:33277)'

    # check if multiple pathways are returned
    response = autocomplete('differentiation')
    assert len(response.json['entries']) == 2

    # check if both genes and pathways are returned simultaneously
    # there should be: a pathway ('a>b<ortive...') and the >B<R gene
    response = autocomplete('b')
    entries = response.json['entries']
    names = [entry['name'] for entry in entries]
    assert all([name in names for name in ['BR', 'abortive mitotic cell cycle']])

    # check if "search more pathways" is displayed
    response = autocomplete('cell')    # cell occurs in all four of added pathways;
    # as a limit of pathways shown is 3, we should get a "show more" link
    links = entries_with_type(response, 'see_more')
    assert len(links) == 1
    assert links[0]['name'] == 'Show all pathways matching <i>cell</i>'

    # test case insensitive text search
    response = autocomplete('AMNIOTIC STEM')
    pathways = entries_with_type(response, 'pathway')
    assert len(pathways) == 1
    assert pathways[0]['name'] == 'amniotic stem cell differentiation'

    # Disease
    disease_names = [
        'Cystic fibrosis', 'Polycystic kidney disease 2',
        'Frontotemporal dementia', 'Cataract, nuclear total'
    ]
    diseases = {name: Disease(name=name) for name in disease_names}
    db.session.add_all(diseases.values())

    response = autocomplete('cystic')
    cystic_matching = entries_with_type(response, 'disease')
    # both 'Cystic fibrosis' and PKD2 should match
    assert len(cystic_matching) == 2

    # is comma containing disease name properly linked?
    response = autocomplete('Cataract')
    cataract = get_entry_and_check_type(response, 'disease')
    assert cataract['name'] == 'Cataract, nuclear total'

    # Gene mutation in disease

    # test suggestions
    response = autocomplete('cystic ')
    entry = entries_with_type(response, 'message')[0]
    # raw string: `\?` is a regex escape, not a valid Python string escape
    assert re.match(r'Do you wish to search for (.*?) mutations\?', entry['name'])

    # currently there are no mutations associated with any disease
    # so the auto-completion should not return any results
    response = autocomplete('cystic in ')
    assert not response.json['entries']

    # let's add a mutation
    m = Mutation(protein=p, position=1, alt='Y')
    bdb_refseq['BR X1Y'] = ['NM_007']
    # note: sig_code is required here
    data = ClinicalData(disease=diseases['Cystic fibrosis'], sig_code=1)
    disease_mutation = InheritedMutation(mutation=m, clin_data=[data])
    db.session.add_all([m, data, disease_mutation])

    # should return '.. in BR' suggestion now.
    for query in ['cystic in', 'cystic in ']:
        response = autocomplete(query)
        result = get_entry_and_check_type(response, 'disease_in_protein')
        assert result['gene'] == 'BR'
        assert result['name'] == 'Cystic fibrosis'

    # both gene search and refseq search should yield the same, non-empty results
    results = []
    for query in ['cystic in BR', 'cystic in NM_007', 'cystic in 007']:
        response = autocomplete(query)
        result = get_entry_and_check_type(response, 'disease_in_protein')
        results.append(result)
    assert all(r == result for r in results) and result
def source_specific_nucleotide_mappings():
    """Count nucleotide mappings (entries of `bdb`) per mutation source.

    Mutations of every source (and, separately, all confirmed mutations
    as the 'merged' pseudo-source) are loaded first and flagged with the
    sources they came from. Then each decoded item stored in `bdb` which
    matches a loaded mutation (by protein id, alt and position) increases
    the counters of all the sources of that mutation.

    Returns:
        dict: {'Nucleotide mappings': {source model or 'merged': count}};
        sources without any matching item are absent.
    """
    from database import bdb
    from genomic_mappings import decode_csv
    from models import Mutation
    from tqdm import tqdm
    from gc import collect

    # mutation key -> bitmask of indices of the sources having the mutation;
    # a bitmask (unlike concatenated textual ids tested with `in`) cannot
    # confuse the source '1' with the source '10' and is cheap to store
    mutations = defaultdict(int)

    def count_mutations(mutations_query, source_index):
        """Flag all mutations yielded by the query as present in the source."""
        flag = 1 << source_index
        for mutation in tqdm(mutations_query, total=mutations_query.count()):
            key = str(mutation[0]) + mutation[1] + str(mutation[2])
            mutations[key] |= flag

    sources_map = dict(enumerate(mutation_sources().values()))

    print('Loading mutations from sources:')

    for source_index, model in tqdm(sources_map.items(), total=len(sources_map)):
        query = (
            db.session.query(Mutation.protein_id, Mutation.alt, Mutation.position)
            .filter(Mutation.in_sources(model))
            # no need for '.filter(Mutation.is_confirmed==True)'
            # (if it is in source of interest, it is confirmed - we do not count MIMPs here)
            .yield_per(5000)
        )
        count_mutations(query, source_index)

    # add merged
    merged_index = len(sources_map)
    sources_map[merged_index] = 'merged'

    print('Loading merged mutations:')

    query = (
        db.session.query(Mutation.protein_id, Mutation.alt, Mutation.position)
        .filter(Mutation.is_confirmed == True)  # noqa: E712 (SQL expression)
        .yield_per(5000)
    )
    count_mutations(query, merged_index)

    print('Mutations loaded')
    collect()

    def iterate_known_muts_sources():
        """Yield the sources bitmask of every stored item matching a loaded mutation."""
        for value in tqdm(bdb.values(), total=len(bdb.db)):
            for item in map(decode_csv, value):
                sources = mutations.get(
                    str(item['protein_id']) + item['alt'] + str(item['pos'])
                )
                if sources:
                    yield sources

    counts = defaultdict(int)
    flags_by_index = [(index, 1 << index) for index in sources_map]

    for sources in iterate_known_muts_sources():
        for index, flag in flags_by_index:
            if sources & flag:
                counts[index] += 1

    return {
        'Nucleotide mappings': {
            sources_map[index]: value
            for index, value in counts.items()
        }
    }
def test_mutation(self):
    """Query the chromosome view for a novel and for a known mutation."""
    methylation_site = Site(position=13, types={SiteType(name='methylation')})
    protein = Protein(
        refseq='NM_007', id=1, sites=[methylation_site],
        sequence='A' * 15, gene=Gene(name='SomeGene')
    )
    db.session.add(protein)

    from database import bdb

    # amino acid position -> genomic position
    genomic_positions = {13: 14370, 15: 14376}
    muts = {}
    for aa_pos, dna_pos in genomic_positions.items():
        mutation = Mutation(protein=protein, position=aa_pos, alt='V')
        bdb.add_genomic_mut('20', dna_pos, 'G', 'A', mutation, is_ptm=True)
        muts[aa_pos] = mutation

    query_url = '/chromosome/mutation/{chrom}/{pos}/{ref}/{alt}'

    # query as a novel mutation
    novel_mutation_query = query_url.format(chrom='chr20', pos=14370, ref='G', alt='A')
    response = self.client.get(novel_mutation_query)

    assert response.status_code == 200

    expected_site = {
        'kinases': [],
        'position': 13,
        'residue': 'A',
        'kinase_groups': [],
        'type': 'methylation'
    }
    expected_mutation = {
        'alt': 'V',
        'gene': 'SomeGene',
        'in_datasets': {},
        'pos': 13,
        'ptm_impact': 'direct',
        'cnt_ptm': 1,
        'closest_sites': ['13 A'],
        'protein': 'NM_007',
        'sites': [expected_site],
        'ref': 'A'
    }
    assert response.json == [expected_mutation]

    # well let's look on a known mutation:
    known = muts[15]
    breast_cancer = Cancer(name='Breast invasive carcinoma', code='BRCA')
    mc3 = MC3Mutation(mutation=known, cancer=breast_cancer, count=1)
    esp = ExomeSequencingMutation(mutation=known, maf_all=0.02, maf_aa=0.02)

    db.session.add_all([known, mc3, esp])
    db.session.commit()

    mutation_a15v_query = query_url.format(chrom='chr20', pos=14376, ref='G', alt='A')

    response = self.client.get(mutation_a15v_query)

    metadata = {
        'MC3': {
            'Cancers': [{'Cancer': 'Breast invasive carcinoma', 'Value': 1}]
        },
        'ESP6500': {'MAF': 0.02, 'MAF AA': 0.02, 'MAF EA': None}
    }

    assert response.json[0]['in_datasets'] == metadata

    expected_values = {'MC3': 1, 'ESP6500': 0.02}

    # if user does not want to download data for all datasets he may use:
    for source, meta in metadata.items():
        filtered_url = mutation_a15v_query + '?filters=Mutation.sources:in:' + source
        first_result = self.client.get(filtered_url).json[0]
        assert first_result['in_datasets'] == {source: meta}
        assert first_result['value'] == expected_values[source]

    # filters which should keep the mutation in results
    response = self.client.get(
        mutation_a15v_query
        + '?filters=Mutation.sources:in:MC3;Mutation.mc3_cancer_code:in:BRCA'
    )
    assert response.json

    response = self.client.get(
        mutation_a15v_query
        + '?filters=Mutation.sources:in:ESP6500;Mutation.populations_ESP6500:in:African American'
    )
    assert response.json

    # a filter which should exclude the mutation from results
    response = self.client.get(
        mutation_a15v_query
        + '?filters=Mutation.sources:in:ESP6500;Mutation.populations_ESP6500:in:European American'
    )
    assert not response.json