def count_by_sources(
    sources: List[MutationSource], site_type: SiteType, primary_isoforms=True,
    by_genes=True, genes=None, muts_conjunction=or_, **kwargs
):
    """Run the motifs counter on mutations selected from the given sources.

    Args:
        sources: mutation sources; one `Mutation.in_sources` criterion is
            built per source and the criteria are combined with
            `muts_conjunction`
        site_type: passed to `MotifsCounter` and used to select the sites
            (`Site.types.contains(site_type)`)
        primary_isoforms: join `Protein` and keep preferred isoforms only
        by_genes: when true, compute one result per gene
        genes: genes to iterate over; if empty/None all genes are queried
        muts_conjunction: callable combining the per-source criteria
        **kwargs: forwarded to `MotifsCounter.count_muts_and_sites`

    Returns:
        The value of `count_muts_and_sites` for all selected mutations and
        sites when `by_genes` is false; otherwise a dict mapping each gene
        name to the value computed for its preferred isoform.
    """
    source_criteria = [Mutation.in_sources(source) for source in sources]
    selected_mutations = Mutation.query.filter(
        muts_conjunction(*source_criteria)
    )

    if primary_isoforms:
        selected_mutations = (
            selected_mutations
            .join(Protein)
            .filter(Protein.is_preferred_isoform)
        )

    typed_sites = Site.query.filter(Site.types.contains(site_type))
    motifs_counter = MotifsCounter(site_type)

    if not by_genes:
        return motifs_counter.count_muts_and_sites(
            selected_mutations, typed_sites, **kwargs
        )

    if not genes:
        genes = Gene.query.all()

    per_gene_counts = {}

    for gene in tqdm(genes):
        isoform = gene.preferred_isoform
        per_gene_counts[gene.name] = motifs_counter.count_muts_and_sites(
            selected_mutations.filter(Mutation.protein == isoform),
            typed_sites.filter(Site.protein == isoform),
            # the outer tqdm already reports progress over genes
            show_progress=False,
            **kwargs
        )

    return per_gene_counts
def source_specific_mutated_sites():
    """Collect per-source counts of mutations and of mutated sites.

    For every entry of `mutation_sources()` three things are computed:
    the count of mutations of that source having `is_confirmed` and
    `is_ptm_distal` set, the count of confirmed mutations selected by
    `Mutation.in_sources(models.MIMPMutation, model)`, and the result of
    `count_mutated_sites` for each site type. The same per-site-type counts
    without a source restriction are stored under the 'merged' key.

    Returns:
        dict with three labelled sub-dictionaries.
    """
    # the type with an empty name goes first: empty will match all sites
    site_type_queries = [models.SiteType(name=''), *models.SiteType.query]

    muts_in_ptm_sites = {}
    mimp_muts = {}
    mutated_sites = defaultdict(dict)

    for name, model in mutation_sources().items():

        muts_in_ptm_sites[name] = (
            Mutation.query
            .filter_by(is_confirmed=True, is_ptm_distal=True)
            .filter(Mutation.in_sources(model))
            .count()
        )

        confirmed_and_in_mimp = and_(
            Mutation.in_sources(models.MIMPMutation, model),
            Mutation.is_confirmed,
        )
        mimp_muts[name] = Mutation.query.filter(confirmed_and_in_mimp).count()

        counts_of_source = mutated_sites[name]
        for site_type in tqdm(site_type_queries):
            counts_of_source[site_type] = count_mutated_sites(
                [site_type], model
            )

    mutated_sites['merged'] = {
        site_type: count_mutated_sites([site_type])
        for site_type in tqdm(site_type_queries)
    }

    return {
        'Mutations - in PTM sites': muts_in_ptm_sites,
        'Mutations - with network-rewiring effect': mimp_muts,
        'PTM sites affected by mutations': mutated_sites
    }
def get_genes_with_mutations_from_sources(sources, only_genes_with_ptm_sites=False):
    """Return the set of genes whose preferred isoform has mutations in sources.

    Args:
        sources: mutation sources, unpacked into `Mutation.in_sources`
        only_genes_with_ptm_sites: when true, drop the genes whose preferred
            isoform has an empty `sites` collection

    Returns:
        set of Gene objects.
    """
    genes_query = (
        db.session.query(Gene)
        # only the preferred isoform of each gene is joined
        .join(Protein, Gene.preferred_isoform_id == Protein.id)
        .join(Mutation)
        .filter(Mutation.in_sources(*sources))
        .distinct()
    )
    found_genes = set(genes_query)

    if not only_genes_with_ptm_sites:
        return found_genes

    return {
        gene
        for gene in found_genes
        if gene.preferred_isoform.sites
    }
def mutated_ptm_sites_in_proximity(
    mutation_source, type_1: str, type_2: str, mutation_filter=True,
    distance: int = 7, only_preferred=True
) -> int:
    """Narrow the result of `ptm_sites_in_proximity` to mutated sites.

    Both site entities returned by `ptm_sites_in_proximity` are required to
    be affected by at least one mutation which is in `mutation_source` and
    satisfies `mutation_filter`.

    NOTE(review): the annotation says `int`, but the value returned is the
    filtered `sites` object itself (no count is taken here) - confirm what
    the callers expect before changing the annotation.

    Args:
        mutation_source: source passed to `Mutation.in_sources`
        type_1: forwarded to `ptm_sites_in_proximity`
        type_2: forwarded to `ptm_sites_in_proximity`
        mutation_filter: additional criterion and-ed with the source
            criterion (the default `True` adds no restriction)
        distance: forwarded to `ptm_sites_in_proximity`
        only_preferred: forwarded to `ptm_sites_in_proximity`
    """
    sites, site_entities = ptm_sites_in_proximity(
        type_1, type_2, distance, only_preferred
    )
    first_site, second_site = site_entities

    mutation_criteria = and_(
        Mutation.in_sources(mutation_source),
        mutation_filter
    )

    sites = sites.filter(first_site.affected_by_mutations.any(mutation_criteria))
    sites = sites.filter(second_site.affected_by_mutations.any(mutation_criteria))

    return sites
def most_mutated_sites(
    sources: List[MutationSource], site_type: SiteType = None, limit=25,
    intersection=True, exclusive=None, mutation_filter=None
):
    """Build a query for the sites with the highest summed mutation counts.

    Sources must have the same value_type (counts/frequencies).

    Args:
        sources: mutation sources; each is joined to the mutations
            (inner join when `intersection`, outer join otherwise)
        site_type: optional site type, applied with
            `SiteType.fuzzy_filter(site_type, join=True)`
        limit: maximal number of rows in the result
        intersection: whether to use inner joins for the sources
        exclusive: sources whose mutations are excluded; must not be
            combined with `intersection` (asserted)
        mutation_filter: optional additional criterion for the query

    Returns:
        Query yielding (Site, mutations_count) rows of preferred isoforms,
        ordered by descending count and limited to `limit` rows.
    """
    assert not (intersection and exclusive)

    counts = prepare_for_summing(sources)
    count_labels = [f'count_{i}' for i in range(len(counts))]

    labelled_counts = [
        count.label(label)
        for label, count in zip(count_labels, counts)
    ]
    per_site = db.session.query(Site, *labelled_counts).select_from(Mutation)

    for source in sources:
        if intersection:
            per_site = per_site.join(source)
        else:
            per_site = per_site.outerjoin(source)

    if exclusive:
        per_site = per_site.filter(~Mutation.in_sources(*exclusive))

    if mutation_filter is not None:
        per_site = per_site.filter(mutation_filter)

    per_site = per_site.join(Mutation.affected_sites)
    per_site = per_site.filter(Site.protein.has(Protein.is_preferred_isoform))

    if site_type:
        per_site = per_site.filter(SiteType.fuzzy_filter(site_type, join=True))

    per_site = per_site.group_by(Site).having(and_(*counts))
    counted_sites = per_site.subquery()

    # sum of the per-source count columns of the subquery
    summed_counts = reduce(
        operator.add,
        [getattr(counted_sites.c, label) for label in count_labels]
    )
    summed_counts = summed_counts.label('mutations_count')

    ranking = db.session.query(aliased(Site, counted_sites), summed_counts)
    ranking = ranking.order_by(desc(summed_counts))

    return ranking.limit(limit)
def get_confirmed_mutations(
    sources, only_preferred=True, genes=None, confirmed_by_definition=False,
    base_query=None
):
    """Build a query retrieving confirmed mutations present in given sources.

    Args:
        sources: list of mutation details (sources) used to filter
            the mutations (may include sources with non-confirmed mutations)
        only_preferred: include only mutations from preferred isoforms
        genes: limit to the preferred isoforms of genes from this list
        confirmed_by_definition: skip the expensive is_confirmed=True filter
            (use when all the sources include only confirmed mutations)
        base_query: the initial mutation query (allows to adjust
            the selected columns); defaults to `Mutation.query`

    Returns:
        Query object yielding mutations.
    """
    def restrict_to_preferred_isoforms(query):
        joined = join_unique(query, Protein)
        return joined.filter(Protein.is_preferred_isoform)

    mutations = base_query if base_query else Mutation.query

    if not confirmed_by_definition:
        mutations = mutations.filter_by(is_confirmed=True)

    # TODO: remove? (the same restriction is applied below if only_preferred)
    mutations = restrict_to_preferred_isoforms(mutations)

    if genes:
        preferred_isoforms_ids = [gene.preferred_isoform_id for gene in genes]
        mutations = mutations.filter(Protein.id.in_(preferred_isoforms_ids))

    mutations = mutations.filter(Mutation.in_sources(*sources))

    if not only_preferred:
        return mutations

    return restrict_to_preferred_isoforms(mutations)
def count_mutated_sites(
    site_types: Iterable[models.SiteType]=tuple(), model=None,
    only_primary=False, disordered=None, custom_filter=None
):
    """Count distinct sites having a mutation within 7 positions.

    Args:
        site_types: site types; a `SiteType.fuzzy_filter` criterion is added
            for each of them
        model: mutation source; when given only mutations from this source
            are used, otherwise only confirmed mutations
        only_primary: join `Protein` and keep preferred isoforms only
        disordered: when not None, require `Site.in_disordered_region`
            to be equal to this value
        custom_filter: optional additional criterion

    Returns:
        The scalar result of the counting query.
    """
    criteria = [
        Mutation.protein_id == Protein.id,
        Site.protein_id == Protein.id,
        Mutation.precomputed_is_ptm,
        *(
            models.SiteType.fuzzy_filter(site_type)
            for site_type in site_types
        ),
    ]

    if custom_filter is not None:
        criteria.append(custom_filter)

    if disordered is not None:
        criteria.append(Site.in_disordered_region == disordered)

    mutation_is_close = Mutation.position.between(
        Site.position - 7,
        Site.position + 7
    )
    # id of the site if the mutation is close enough, NULL otherwise
    # (hence only the sites with a close mutation are counted)
    close_site_id = case(
        [(mutation_is_close, Site.id)],
        else_=literal_column('NULL')
    )

    sites_count = (
        db.session.query(func.count(distinct(close_site_id)))
        .filter(and_(*criteria))
        .join(Mutation, Site.protein_id == Mutation.protein_id)
    )

    if model:
        sites_count = sites_count.filter(Mutation.in_sources(model))
    else:
        sites_count = sites_count.filter(Mutation.is_confirmed == True)

    if only_primary:
        sites_count = (
            sites_count
            .join(Protein)
            .filter(Protein.is_preferred_isoform)
        )

    return sites_count.scalar()
def count_mutations_from_genes(genes, sources, only_preferred_isoforms=False, strict=True):
    """Counts mutations and PTM mutations from isoforms from given set of genes.

    The totals and the fraction of PTM mutations are printed before
    returning; the fraction is reported as NaN when no mutations were found.

    Args:
        genes: a list of Gene
        sources: a list of MutationDetails - only confirmed mutations from
            sources identified by given MutationDetail classes will be counted
        only_preferred_isoforms: should only one isoform per gene
            (the preferred/primary one) be used when filtering mutations?
        strict: when true, distinct (position, alt, protein id) rows are
            counted; otherwise distinct rows of `Mutation.query`

    Returns:
        Tuple: (count of all mutations, count of mutations having
        `precomputed_is_ptm` set).
    """
    all_mutations_count = 0
    ptm_mutations_count = 0

    if strict:
        base_query = (
            db.session.query(Mutation.position, Mutation.alt, Protein.id)
            .select_from(Mutation)
            .join(Protein)
        )
    else:
        base_query = Mutation.query

    for gene in tqdm(genes):

        if only_preferred_isoforms:
            proteins = [gene.preferred_isoform]
        else:
            proteins = gene.isoforms

        mutations_filters = and_(
            Mutation.protein_id.in_([p.id for p in proteins]),
            Mutation.is_confirmed == True,
            Mutation.in_sources(*sources)
        )

        all_mutations_count += (
            base_query
            .filter(mutations_filters)
            .distinct()
            .count()
        )

        ptm_mutations_count += (
            base_query
            .filter(and_(Mutation.precomputed_is_ptm, mutations_filters))
            .distinct()
            .count()
        )

    # guard against ZeroDivisionError: with no genes or no matching
    # mutations the summary used to crash before the result was returned
    if all_mutations_count:
        ptm_fraction = ptm_mutations_count / all_mutations_count
    else:
        ptm_fraction = float('nan')

    print(all_mutations_count, ptm_mutations_count, ptm_fraction)

    return all_mutations_count, ptm_mutations_count
def mutation_by_source(combination, site_type=None, only_within_ptm_sites=False, only_primary=False):
    """Count mutations selected by `Mutation.in_sources(*combination)`.

    Args:
        combination: mutation sources, unpacked into `Mutation.in_sources`
        site_type: when given, keep mutations affecting at least one site
            whose types contain this type
        only_within_ptm_sites: keep only mutations with
            `precomputed_is_ptm` set
        only_primary: join `Protein` and keep preferred isoforms only

    Returns:
        Result of `count()` on the assembled query.
    """
    criteria = [Mutation.in_sources(*combination)]

    if only_within_ptm_sites:
        # the precomputed flag is used here; an `is_ptm_distal == True`
        # criterion was left commented out in the earlier version
        criteria.append(Mutation.precomputed_is_ptm)

    if site_type:
        affects_site_of_type = Mutation.affected_sites.any(
            Site.types.contains(site_type)
        )
        criteria.append(affects_site_of_type)

    mutations = Mutation.query
    for criterion in criteria:
        mutations = mutations.filter(criterion)

    if only_primary:
        mutations = mutations.join(Protein).filter(Protein.is_preferred_isoform)

    return mutations.count()
def count_by_sources(sources: List[MutationSource]):
    """Count mutations selected by `Mutation.in_sources(*sources)`."""
    in_given_sources = Mutation.in_sources(*sources)
    matching_mutations = Mutation.query.filter(in_given_sources)
    return matching_mutations.count()
def confirmed_with_mimp(self):
    """Count confirmed mutations selected by `in_sources(MIMPMutation)`."""
    criteria = and_(
        Mutation.in_sources(models.MIMPMutation),
        Mutation.is_confirmed,
    )
    confirmed_mimp_mutations = Mutation.query.filter(criteria)
    return confirmed_mimp_mutations.count()
def source_specific_nucleotide_mappings():
    """Count entries of `bdb` matching known mutations, per mutation source.

    First, every mutation of each source from `mutation_sources()` (and every
    confirmed mutation, for the additional 'merged' pseudo-source) is
    registered under a key made of its protein id, alt and position. Then all
    the items decoded from the values of `bdb` are looked up by the same key
    and, for each source the matching mutation belongs to, a counter is
    increased.

    The membership in sources is stored as an integer bitmask (bit `n` stands
    for the n-th source) rather than as a string of concatenated source
    numbers: substring tests on such a string give false matches once there
    are ten or more sources (e.g. '1' is found in '10').

    Returns:
        dict with a single 'Nucleotide mappings' key, holding a dict which
        maps a source model (or 'merged') to the count of matching items;
        sources without any matching item are absent.
    """
    from database import bdb
    from genomic_mappings import decode_csv
    from models import Mutation
    from tqdm import tqdm
    from gc import collect

    # mutation key -> bitmask of sources which include the mutation
    sources_of_mutation = defaultdict(int)

    def mutation_key(protein_id, alt, position):
        return str(protein_id) + alt + str(position)

    def mutations_query(criterion):
        return (
            db.session.query(Mutation.protein_id, Mutation.alt, Mutation.position)
            .filter(criterion)
            .yield_per(5000)
        )

    def register_mutations(query, source_bit):
        for protein_id, alt, position in tqdm(query, total=query.count()):
            sources_of_mutation[mutation_key(protein_id, alt, position)] |= source_bit

    sources_map = dict(enumerate(mutation_sources().values()))

    print('Loading mutations from sources:')

    for index, model in tqdm(sources_map.items(), total=len(sources_map)):
        # no need for '.filter(Mutation.is_confirmed==True)'
        # (if it is in source of interest, it is confirmed - we do not count MIMPs here)
        register_mutations(
            mutations_query(Mutation.in_sources(model)),
            1 << index
        )

    # add merged
    merged_index = len(sources_map)
    sources_map[merged_index] = 'merged'

    print('Loading merged mutations:')

    register_mutations(
        mutations_query(Mutation.is_confirmed == True),
        1 << merged_index
    )

    print('Mutations loaded')
    collect()

    def iterate_known_muts_sources():
        for value in tqdm(bdb.values(), total=len(bdb.db)):
            for item in map(decode_csv, value):
                # .get() is used so that no entries are created for misses
                sources = sources_of_mutation.get(
                    mutation_key(item['protein_id'], item['alt'], item['pos'])
                )
                if sources:
                    yield sources

    counts = defaultdict(int)
    sources_bits = [(index, 1 << index) for index in sources_map]

    for sources in iterate_known_muts_sources():
        for index, bit in sources_bits:
            if sources & bit:
                counts[index] += 1

    return {
        'Nucleotide mappings': {
            sources_map[index]: value
            for index, value in counts.items()
        }
    }
def gather_muts_and_sites(
    self, mutations: BaseQuery, sites: BaseQuery, show_progress=True,
    occurrences_in: List[MutationSource] = None, intersection=None
) -> MotifsData:
    """Gather sites with motifs and the mutations around/breaking them.

    If occurrences_in is provided, the count of mutations will represent
    number of occurrences of mutations in provided sources, instead of
    number of distinct substitutions.

    Args:
        mutations: query of mutations; narrowed here to mutations affecting
            at least one site whose types contain `self.site_type`
        sites: query of the sites to consider
        show_progress: whether to wrap the mutations iteration in tqdm
            (which requires an additional `count()` query)
        occurrences_in: sources whose values are summed up to get the count
            of a mutation; when empty/None each mutation counts as 1
        intersection: sources; when given, only the sites affected by
            mutations which are in all of these sources are accepted

    Returns:
        MotifsData with: sites having each motif, sites whose motif is
        broken by some mutation, and (per motif) mutation -> count mappings
        for mutations around sites with the motif and for mutations breaking
        the motif (as judged by `self.breaking_modes[self.mode]`).
    """
    if intersection:
        accepted_sites = (
            sites
            .join(Mutation.affected_sites)
            .filter(and_(*[
                Mutation.in_sources(source)
                for source in intersection
            ]))
            .all()
        )
    else:
        accepted_sites = sites.all()

    # membership is tested for every affected site of every mutation,
    # so a set is used instead of scanning the list each time
    # (sites are hashable: they are stored in sets below as well)
    accepted_sites_lookup = set(accepted_sites)

    mutations_affecting_sites = mutations.filter(
        Mutation.affected_sites.any(Site.types.contains(self.site_type))
    )

    muts_around_sites_with_motif = defaultdict(dict)
    muts_breaking_sites_motif = defaultdict(dict)
    sites_with_broken_motif = defaultdict(set)

    sites_with_motif = select_sites_with_motifs(
        accepted_sites, self.site_specific_motifs
    )

    if occurrences_in:
        def mutation_count(mut: Mutation):
            # sources which do not include the mutation contribute nothing
            return sum(
                mut.sources_map[source.name].get_value()
                for source in occurrences_in
                if source.name in mut.sources_map
            )
    else:
        def mutation_count(mut):
            return 1

    is_affected = self.breaking_modes[self.mode]

    if show_progress:
        ptm_muts = mutations_affecting_sites.count()
        mutations_affecting_sites = tqdm(mutations_affecting_sites, total=ptm_muts)

    for mutation in mutations_affecting_sites:
        for site in mutation.affected_sites:

            if site not in accepted_sites_lookup:
                continue

            for motif_name, motif in self.site_specific_motifs.items():

                if site not in sites_with_motif[motif_name]:
                    continue

                count = mutation_count(mutation)
                muts_around_sites_with_motif[motif_name][mutation] = count

                mutated_sequence = mutate_sequence(site, mutation, offset=7)

                if is_affected(mutated_sequence, motif):
                    sites_with_broken_motif[motif_name].add(site)
                    muts_breaking_sites_motif[motif_name][mutation] = count

    return MotifsData(
        sites_with_motif=sites_with_motif,
        sites_with_broken_motif=sites_with_broken_motif,
        muts_around_sites_with_motif=muts_around_sites_with_motif,
        muts_breaking_sites_motif=muts_breaking_sites_motif
    )
def gather_ptm_muts_impacts(
    source: MutationSource, site_type: SiteType,
    limit_to_genes: List[str] = None, occurrences=True,
    limit_to_muts=False, muts_filter=None
):
    """Sum up mutations of a source by their impact on PTM sites, per gene.

    Args:
        source: mutation source to gather mutations from
        site_type: PTM site type for which affecting mutations
            will be gathered
        limit_to_genes: list of gene names for which mutations
            of primary isoforms will be gathered
        occurrences: whether to count occurrences or distinct mutations
        limit_to_muts: data frame defining mutations and counts (columns
            read here: position, mut_residue, isoform, count), like from AD
            data frame; providing custom mutations lists overrides
            the "occurrences" setting
        muts_filter: SQLAlchemy filter for mutations

    Returns:
        dict: impact name -> (gene name -> summed value); an empty dict
        if there are no known motifs for the site type (a warning is
        issued in such a case).
    """
    try:
        motifs_counter = MotifsCounter(site_type, mode='change_of_motif')
    except NoKnownMotifs as error:
        warn(f'Impacts collection failed, due to: {error}')
        return {}

    sites = (
        Site.query
        .filter(SiteType.fuzzy_filter(site_type, join=True))
        .join(Protein)
        .filter(Protein.is_preferred_isoform)
    )

    def fuzzy_site_filter(sites):
        # matches 'O-glycosylation' for site_type 'glycosylation'
        return [
            site
            for site in sites
            if any(
                site_type.name in type_name
                for type_name in site.types_names
            )
        ]

    # order matters
    impacts = ('direct', 'motif-changing', 'proximal', 'distal')
    mutations_by_impact_by_gene = {
        impact: defaultdict(int)
        for impact in impacts
    }

    mutations = (
        Mutation.query
        .filter(Mutation.in_sources(source))
        .join(Protein)
        .join(Gene, Gene.preferred_isoform_id == Protein.id)
    )

    if muts_filter is not None:
        mutations = mutations.filter(muts_filter)

    motifs_data = motifs_counter.gather_muts_and_sites(
        mutations, sites, occurrences_in=[source]
    )

    # mutations breaking any of the motifs, regardless of the motif name
    all_breaking_muts = set()
    for breaking_muts in motifs_data.muts_breaking_sites_motif.values():
        all_breaking_muts.update(breaking_muts)

    mutations = mutations.filter(
        Mutation.affected_sites.any(
            SiteType.fuzzy_filter(site_type, join=True)
        )
    )

    if limit_to_genes is not None:
        proteins_ids = (
            db.session.query(Protein.id)
            .select_from(Gene)
            .join(Gene.preferred_isoform)
            .filter(Gene.name.in_(limit_to_genes))
            .all()
        )
        mutations = mutations.filter(Protein.id.in_(proteins_ids))

    mutations = mutations.with_entities(Gene.name, Mutation)

    use_custom_muts = limit_to_muts is not False

    if use_custom_muts:
        # mutation object -> custom count, one entry per row of the frame
        muts = {
            Mutation.query.filter_by(
                position=mut.position,
                alt=mut.mut_residue,
                protein=Protein.query.filter_by(refseq=mut.isoform).one()
            ).one(): int(mut.count)
            for mut in limit_to_muts.itertuples(index=False)
        }

    for gene_name, mutation in tqdm(mutations, total=mutations.count()):

        if use_custom_muts:
            if mutation not in muts:
                continue
            value = muts[mutation]
        elif occurrences:
            value = mutation.sources_map[source.name].get_value()
        else:
            value = 1

        impact = mutation.impact_on_ptm(fuzzy_site_filter)

        # a motif-breaking mutation is reported as 'motif-changing',
        # unless its impact is 'direct'
        if impact != 'direct' and mutation in all_breaking_muts:
            impact = 'motif-changing'
        else:
            assert impact != 'none'

        mutations_by_impact_by_gene[impact][gene_name] += value

    return mutations_by_impact_by_gene