Пример #1
0
def count_by_sources(sources: List[MutationSource],
                     site_type: SiteType,
                     primary_isoforms=True,
                     by_genes=True,
                     genes=None,
                     muts_conjunction=or_,
                     **kwargs):
    """Run `MotifsCounter.count_muts_and_sites` for mutations from sources.

    Args:
        sources: mutation sources; one `Mutation.in_sources` condition is
            created per source and all of them are combined with
            `muts_conjunction`
        site_type: site type used both to select the sites and to create
            the `MotifsCounter`
        primary_isoforms: if true, the mutations query is joined with
            Protein and filtered by `Protein.is_preferred_isoform`
        by_genes: if false, a single result for all mutations and sites is
            returned; otherwise results are computed gene by gene
        genes: genes to iterate over; when empty or None all genes are
            fetched from the database
        muts_conjunction: callable combining the per-source conditions
        **kwargs: passed through to `count_muts_and_sites`

    Returns:
        The value returned by `count_muts_and_sites` (when `by_genes` is
        false) or a dict mapping gene name to such a value.
    """
    source_conditions = [Mutation.in_sources(source) for source in sources]
    mutations = Mutation.query.filter(muts_conjunction(*source_conditions))

    if primary_isoforms:
        mutations = (
            mutations
            .join(Protein)
            .filter(Protein.is_preferred_isoform)
        )

    typed_sites = Site.query.filter(Site.types.contains(site_type))
    counter = MotifsCounter(site_type)

    if not by_genes:
        return counter.count_muts_and_sites(mutations, typed_sites, **kwargs)

    if not genes:
        genes = Gene.query.all()

    counts_by_genes = {}

    for gene in tqdm(genes):
        # both mutations and sites are narrowed to the preferred isoform
        gene_mutations = mutations.filter(
            Mutation.protein == gene.preferred_isoform
        )
        gene_sites = typed_sites.filter(
            Site.protein == gene.preferred_isoform
        )
        counts_by_genes[gene.name] = counter.count_muts_and_sites(
            gene_mutations, gene_sites, show_progress=False, **kwargs
        )

    return counts_by_genes
Пример #2
0
def source_specific_mutated_sites():
    """Collect per-source counts of mutations and of mutated sites.

    For every entry of `mutation_sources()` three things are counted:
    confirmed mutations flagged with `is_ptm_distal`, confirmed mutations
    that are also in `MIMPMutation`, and (per site type) the result of
    `count_mutated_sites`. The site counts are additionally computed
    without a source restriction and stored under the 'merged' key.

    Returns:
        A dict with three named statistics, each keyed by source name.
    """
    muts_in_ptm_sites = {}
    mimp_muts = {}
    mutated_sites = defaultdict(dict)

    # the leading site type with an empty name will match all sites
    site_type_queries = [models.SiteType(name=''), *models.SiteType.query]

    for name, model in mutation_sources().items():
        confirmed_distal = Mutation.query.filter_by(
            is_confirmed=True, is_ptm_distal=True
        )
        muts_in_ptm_sites[name] = (
            confirmed_distal.filter(Mutation.in_sources(model)).count()
        )

        confirmed_with_mimp = and_(
            Mutation.in_sources(models.MIMPMutation, model),
            Mutation.is_confirmed,
        )
        mimp_muts[name] = Mutation.query.filter(confirmed_with_mimp).count()

        for site_type in tqdm(site_type_queries):
            mutated_sites[name][site_type] = count_mutated_sites(
                [site_type], model
            )

    mutated_sites['merged'] = {
        site_type: count_mutated_sites([site_type])
        for site_type in tqdm(site_type_queries)
    }

    return {
        'Mutations - in PTM sites': muts_in_ptm_sites,
        'Mutations - with network-rewiring effect': mimp_muts,
        'PTM sites affected by mutations': mutated_sites
    }
Пример #3
0
def get_genes_with_mutations_from_sources(sources,
                                          only_genes_with_ptm_sites=False):
    """Return the set of genes whose preferred isoform has matching mutations.

    Args:
        sources: mutation sources passed to `Mutation.in_sources`
        only_genes_with_ptm_sites: if true, keep only the genes whose
            preferred isoform has a non-empty `sites` collection

    Returns:
        A set of Gene objects.
    """
    genes_query = (
        db.session.query(Gene)
        # genes are linked to mutations through their preferred isoform only
        .join(Protein, Gene.preferred_isoform_id == Protein.id)
        .join(Mutation)
        .filter(Mutation.in_sources(*sources))
        .distinct()
    )
    genes = set(genes_query)

    if not only_genes_with_ptm_sites:
        return genes

    return {gene for gene in genes if gene.preferred_isoform.sites}
Пример #4
0
def mutated_ptm_sites_in_proximity(mutation_source,
                                   type_1: str,
                                   type_2: str,
                                   mutation_filter=True,
                                   distance: int = 7,
                                   only_preferred=True):
    """Select pairs of nearby PTM sites where both sites are mutated.

    The pairs are obtained from `ptm_sites_in_proximity` and then narrowed
    down so that each of the two sites is affected by at least one mutation
    from `mutation_source` which also satisfies `mutation_filter`.

    Args:
        mutation_source: source passed to `Mutation.in_sources`
        type_1: forwarded to `ptm_sites_in_proximity`
        type_2: forwarded to `ptm_sites_in_proximity`
        mutation_filter: additional condition AND-ed with the source
            condition; the default `True` adds no restriction
        distance: forwarded to `ptm_sites_in_proximity`
        only_preferred: forwarded to `ptm_sites_in_proximity`

    Returns:
        The filtered `sites` object produced by `ptm_sites_in_proximity`
        (it is only required to support `.filter`; presumably a query).
        Note that this is NOT a count - the previous `-> int` annotation
        did not match the returned value and was removed; callers wanting
        a number need to count the result themselves.
    """
    sites, (site_1, site_2) = ptm_sites_in_proximity(type_1, type_2, distance,
                                                     only_preferred)
    for site in (site_1, site_2):
        # each filter() call adds one more condition, so in the end
        # both sites of the pair have to be affected
        sites = sites.filter(
            site.affected_by_mutations.any(
                and_(Mutation.in_sources(mutation_source), mutation_filter)))
    return sites
Пример #5
0
def most_mutated_sites(sources: List[MutationSource],
                       site_type: SiteType = None,
                       limit=25,
                       intersection=True,
                       exclusive=None,
                       mutation_filter=None):
    """Build a query for sites ordered by the summed mutation counts.

    Sources must have the same value_type (counts/frequencies).

    Args:
        sources: mutation sources; each is joined to the Mutation table
        site_type: optional site type applied via `SiteType.fuzzy_filter`
        limit: maximal number of rows in the final query
        intersection: if true the sources are inner-joined, otherwise
            outer-joined
        exclusive: sources whose mutations are excluded; only honoured
            when `intersection` is false (both cannot be set at once)
        mutation_filter: optional extra condition for the mutations

    Returns:
        Query of (Site, mutations_count) rows, sorted by the count in
        descending order and limited to `limit` rows.
    """

    assert not (intersection and exclusive)

    counts = prepare_for_summing(sources)
    labelled_counts = [
        count.label(f'count_{i}')
        for i, count in enumerate(counts)
    ]

    per_source = db.session.query(Site, *labelled_counts)
    per_source = per_source.select_from(Mutation)

    for source in sources:
        if intersection:
            per_source = per_source.join(source)
        else:
            per_source = per_source.outerjoin(source)

    if not intersection and exclusive:
        per_source = per_source.filter(~Mutation.in_sources(*exclusive))

    if mutation_filter is not None:
        per_source = per_source.filter(mutation_filter)

    per_source = per_source.join(Mutation.affected_sites)
    per_source = per_source.filter(
        Site.protein.has(Protein.is_preferred_isoform)
    )

    if site_type:
        per_source = per_source.filter(
            SiteType.fuzzy_filter(site_type, join=True)
        )

    # one row per site; sites for which any of the counts is falsy
    # on the database side are rejected by the HAVING clause
    counted_sites = (
        per_source
        .group_by(Site)
        .having(and_(*counts))
        .subquery()
    )

    count_columns = [
        getattr(counted_sites.c, f'count_{i}')
        for i in range(len(counts))
    ]
    total_muts_count = reduce(operator.add, count_columns)
    total_muts_count = total_muts_count.label('mutations_count')

    ranked_sites = db.session.query(
        aliased(Site, counted_sites),
        total_muts_count,
    )
    ranked_sites = ranked_sites.order_by(desc(total_muts_count))

    return ranked_sites.limit(limit)
Пример #6
0
def get_confirmed_mutations(sources,
                            only_preferred=True,
                            genes=None,
                            confirmed_by_definition=False,
                            base_query=None):
    """
    Build a query retrieving confirmed mutations which have specific mutation details.

    Args:
        sources: list of mutation details (sources) used to filter
            the mutations (may include sources with non-confirmed mutations)
        only_preferred: include only mutations from preferred isoforms;
            note that the preferred-isoform restriction is currently
            applied in any case (see the TODO in the body), so this flag
            only decides whether it is applied a second time
        genes: limit to the preferred isoforms of genes from provided list
        confirmed_by_definition: skip the expensive is_confirmed=True
            filter because all sources include only confirmed mutations
        base_query: the initial mutation query (allows to adjust selected
            columns); `Mutation.query` is used when not given

    Returns:
        Query object yielding mutations.
    """

    def only_from_primary_isoforms(mutations_query):
        with_protein = join_unique(mutations_query, Protein)
        return with_protein.filter(Protein.is_preferred_isoform)

    mutations = base_query if base_query else Mutation.query

    if not confirmed_by_definition:
        mutations = mutations.filter_by(is_confirmed=True)

    # TODO: remove?
    mutations = only_from_primary_isoforms(mutations)

    if genes:
        preferred_isoforms_ids = [gene.preferred_isoform_id for gene in genes]
        mutations = mutations.filter(Protein.id.in_(preferred_isoforms_ids))

    mutations = mutations.filter(Mutation.in_sources(*sources))

    if only_preferred:
        return only_from_primary_isoforms(mutations)

    return mutations
Пример #7
0
def count_mutated_sites(
    site_types: Iterable[models.SiteType]=tuple(), model=None,
    only_primary=False, disordered=None, custom_filter=None
):
    """Count distinct sites having a PTM mutation within 7 positions.

    Args:
        site_types: site types; one `SiteType.fuzzy_filter` condition is
            added for each of them
        model: mutation source to restrict the mutations to; when not
            given, all confirmed mutations are used instead
        only_primary: if true, join Protein and keep preferred isoforms only
        disordered: if not None, required value of `Site.in_disordered_region`
        custom_filter: optional additional condition

    Returns:
        The scalar result of the counting query.
    """
    conditions = [
        Mutation.protein_id == Protein.id,
        Site.protein_id == Protein.id,
        Mutation.precomputed_is_ptm,
        *(
            models.SiteType.fuzzy_filter(site_type)
            for site_type in site_types
        ),
    ]
    if custom_filter is not None:
        conditions.append(custom_filter)
    if disordered is not None:
        conditions.append(Site.in_disordered_region == disordered)

    mutation_close_to_site = Mutation.position.between(
        Site.position - 7,
        Site.position + 7
    )
    # site id when the mutation is close enough, NULL otherwise;
    # COUNT(DISTINCT ...) then ignores the NULLs
    mutated_site_id = case(
        [(mutation_close_to_site, Site.id)],
        else_=literal_column('NULL')
    )

    query = (
        db.session.query(func.count(distinct(mutated_site_id)))
        .filter(and_(*conditions))
        .join(Mutation, Site.protein_id == Mutation.protein_id)
    )

    if model:
        source_condition = Mutation.in_sources(model)
    else:
        source_condition = Mutation.is_confirmed == True
    query = query.filter(source_condition)

    if only_primary:
        query = query.join(Protein).filter(Protein.is_preferred_isoform)

    return query.scalar()
Пример #8
0
def count_mutations_from_genes(genes,
                               sources,
                               only_preferred_isoforms=False,
                               strict=True):
    """Counts mutations and PTM mutations from isoforms from given set of genes.

    The counts and the fraction of PTM mutations are also printed; the
    fraction is reported as NaN when no mutations were found at all.

    Args:
        genes: a list of Gene
        sources: a list of MutationDetails - only confirmed mutations from
            sources identified by given MutationDetail classes will be counted
        only_preferred_isoforms: should only one isoform per gene
            (the preferred/primary one) be used when filtering mutations?
        strict: if true, distinct (position, alt, protein id) tuples are
            counted; otherwise distinct rows of the plain Mutation query

    Returns:
        A tuple: (count of all mutations, count of PTM mutations).
    """
    all_mutations_count = 0
    ptm_mutations_count = 0

    if strict:
        base_query = (db.session.query(
            Mutation.position, Mutation.alt,
            Protein.id).select_from(Mutation).join(Protein))
    else:
        base_query = Mutation.query

    for gene in tqdm(genes):
        if only_preferred_isoforms:
            proteins = [gene.preferred_isoform]
        else:
            proteins = gene.isoforms

        mutations_filters = and_(
            Mutation.protein_id.in_([p.id for p in proteins]),
            Mutation.is_confirmed == True, Mutation.in_sources(*sources))

        all_mutations_count += (
            base_query.filter(mutations_filters).distinct().count())

        ptm_mutations_count += (base_query.filter(
            and_(Mutation.precomputed_is_ptm,
                 mutations_filters)).distinct().count())

    # an empty gene list (or genes without matching mutations) would
    # otherwise crash the report below with ZeroDivisionError
    if all_mutations_count:
        ptm_fraction = ptm_mutations_count / all_mutations_count
    else:
        ptm_fraction = float('nan')

    print(all_mutations_count, ptm_mutations_count, ptm_fraction)
    return all_mutations_count, ptm_mutations_count
Пример #9
0
def mutation_by_source(combination,
                       site_type=None,
                       only_within_ptm_sites=False,
                       only_primary=False):
    """Count mutations which are in the given combination of sources.

    Args:
        combination: sources passed together to `Mutation.in_sources`
        site_type: if given, count only mutations affecting at least one
            site whose types contain this type
        only_within_ptm_sites: if true, count only mutations with
            `precomputed_is_ptm` set
        only_primary: if true, join Protein and keep preferred isoforms only

    Returns:
        Number of matching mutations.
    """
    mutations = Mutation.query.filter(Mutation.in_sources(*combination))

    if only_within_ptm_sites:
        mutations = mutations.filter(Mutation.precomputed_is_ptm)

    if site_type:
        affects_site_of_type = Mutation.affected_sites.any(
            Site.types.contains(site_type)
        )
        mutations = mutations.filter(affects_site_of_type)

    if only_primary:
        mutations = (
            mutations
            .join(Protein)
            .filter(Protein.is_preferred_isoform)
        )

    return mutations.count()
Пример #10
0
 def count_by_sources(sources: List[MutationSource]):
     """Return the number of mutations selected by `Mutation.in_sources`."""
     from_given_sources = Mutation.in_sources(*sources)
     matching_mutations = Mutation.query.filter(from_given_sources)
     return matching_mutations.count()
Пример #11
0
 def confirmed_with_mimp(self):
     """Return the number of confirmed mutations which are in MIMPMutation."""
     has_mimp = Mutation.in_sources(models.MIMPMutation)
     confirmed_and_mimp = and_(has_mimp, Mutation.is_confirmed)
     return Mutation.query.filter(confirmed_and_mimp).count()
Пример #12
0
def source_specific_nucleotide_mappings():
    """Count nucleotide mappings (entries of `bdb`) by mutation source.

    First, every mutation of each source (and then every confirmed
    mutation, as the 'merged' pseudo-source) is loaded and remembered under
    a "<protein_id><alt><position>" key together with the codes of sources
    it was seen in. Then all the items stored in `bdb` are decoded and, for
    each item matching a remembered mutation, the counters of the sources
    of that mutation are incremented.

    Returns:
        {'Nucleotide mappings': {source model or 'merged': count}};
        sources without any matching item are absent from the inner dict.
    """
    from database import bdb
    from genomic_mappings import decode_csv
    from models import Mutation
    from tqdm import tqdm
    from gc import collect

    def source_code(index):
        # Exactly one character per source, so that the membership test on
        # the concatenated codes cannot confuse e.g. source 1 with
        # source 10; for the first ten sources it equals str(index).
        return chr(ord('0') + index)

    def mutation_key(protein_id, alt, position):
        return str(protein_id) + alt + str(position)

    # mutation key -> concatenated codes of the sources having the mutation
    mutations = defaultdict(str)

    def count_mutations(mutations_query, code):
        # the code is passed explicitly rather than read from a variable
        # of the enclosing scope which changes between the calls
        for protein_id, alt, position in tqdm(
            mutations_query, total=mutations_query.count()
        ):
            mutations[mutation_key(protein_id, alt, position)] += code

    sources_map = {
        source_code(index): model
        for index, model in enumerate(mutation_sources().values())
    }

    print('Loading mutations from sources:')
    for code, model in tqdm(sources_map.items(), total=len(sources_map)):
        query = (
            db.session.query(Mutation.protein_id, Mutation.alt, Mutation.position)
            .filter(Mutation.in_sources(model))
            # no need for '.filter(Mutation.is_confirmed==True)'
            # (if it is in source of interest, it is confirmed - we do not count MIMPs here)
            .yield_per(5000)
        )
        count_mutations(query, code)

    # add merged
    merged_code = source_code(len(sources_map))
    sources_map[merged_code] = 'merged'
    print('Loading merged mutations:')

    query = (
        db.session.query(Mutation.protein_id, Mutation.alt, Mutation.position)
        .filter(Mutation.is_confirmed == True)
        .yield_per(5000)
    )
    count_mutations(query, merged_code)

    print('Mutations loaded')
    collect()

    def iterate_known_muts_sources():
        for value in tqdm(bdb.values(), total=len(bdb.db)):
            for item in map(decode_csv, value):
                # .get() does not insert the missing keys into defaultdict
                sources = mutations.get(
                    mutation_key(item['protein_id'], item['alt'], item['pos'])
                )
                if sources:
                    yield sources

    counts = defaultdict(int)
    codes = list(sources_map)

    for sources in iterate_known_muts_sources():
        for code in codes:
            if code in sources:
                counts[code] += 1

    return {
        'Nucleotide mappings': {
            sources_map[key]: value
            for key, value in counts.items()
        }
    }
Пример #13
0
    def gather_muts_and_sites(self,
                              mutations: BaseQuery,
                              sites: BaseQuery,
                              show_progress=True,
                              occurrences_in: List[MutationSource] = None,
                              intersection=None) -> MotifsData:
        """Collect sites with motifs and the mutations around/breaking them.

        If occurrences_in is provided, the count of mutations will
        represent number of occurrences of mutations in provided
        sources, instead of number of distinct substitutions.

        Args:
            mutations: query of mutations to be examined; it is narrowed
                down to mutations affecting a site of `self.site_type`
            sites: query of sites to be considered
            show_progress: if true, wrap the iteration in a progress bar
                (this issues an additional count query)
            occurrences_in: sources from which the per-mutation values
                are summed; each mutation counts as 1 when not given
            intersection: if given, accept only sites affected by
                mutations which are in all of these sources

        Returns:
            MotifsData with four mappings keyed by motif name.
        """

        if intersection:
            accepted_sites = sites.join(Mutation.affected_sites).filter(
                and_(*[Mutation.in_sources(source)
                       for source in intersection])).all()
        else:
            accepted_sites = sites.all()

        # membership is tested for every site of every mutation, thus use
        # a set (sites are hashable - they are added to sets below as well)
        accepted_sites_lookup = set(accepted_sites)

        mutations_affecting_sites = mutations.filter(
            Mutation.affected_sites.any(Site.types.contains(self.site_type)))

        muts_around_sites_with_motif = defaultdict(dict)
        muts_breaking_sites_motif = defaultdict(dict)

        sites_with_broken_motif = defaultdict(set)

        sites_with_motif = select_sites_with_motifs(accepted_sites,
                                                    self.site_specific_motifs)

        if occurrences_in:

            def mutation_count(mut: Mutation):
                # sources which do not include the mutation contribute zero
                return sum(
                    mut.sources_map[source.name].get_value()
                    for source in occurrences_in
                    if source.name in mut.sources_map
                )
        else:

            def mutation_count(mut):
                return 1

        is_affected = self.breaking_modes[self.mode]

        if show_progress:
            ptm_muts = mutations_affecting_sites.count()
            mutations_affecting_sites = tqdm(mutations_affecting_sites,
                                             total=ptm_muts)

        for mutation in mutations_affecting_sites:
            for site in mutation.affected_sites:
                if site not in accepted_sites_lookup:
                    continue

                for motif_name, motif in self.site_specific_motifs.items():
                    if site not in sites_with_motif[motif_name]:
                        continue

                    count = mutation_count(mutation)
                    muts_around_sites_with_motif[motif_name][mutation] = count

                    mutated_sequence = mutate_sequence(site,
                                                       mutation,
                                                       offset=7)

                    if is_affected(mutated_sequence, motif):
                        sites_with_broken_motif[motif_name].add(site)
                        muts_breaking_sites_motif[motif_name][
                            mutation] = count

        return MotifsData(
            sites_with_motif=sites_with_motif,
            sites_with_broken_motif=sites_with_broken_motif,
            muts_around_sites_with_motif=muts_around_sites_with_motif,
            muts_breaking_sites_motif=muts_breaking_sites_motif)
Пример #14
0
def gather_ptm_muts_impacts(source: MutationSource,
                            site_type: SiteType,
                            limit_to_genes: List[str] = None,
                            occurrences=True,
                            limit_to_muts=False,
                            muts_filter=None):
    """Sum up mutations by their impact on PTM sites and by gene.

    Args:
        source: mutation source to gather mutations from
        site_type: PTM site type for which affecting mutations will be gathered
        limit_to_genes: list of gene names for which mutations of primary isoforms will be gathered
        occurrences: whether to count occurrences or distinct mutations
        limit_to_muts: table of mutations and counts, like from AD data frame;
            it has to provide `itertuples(index=False)` yielding rows with
            `position`, `mut_residue`, `isoform` and `count` attributes;
            providing custom mutations lists overrides "occurrences" setting
        muts_filter: SQLAlchemy filter for mutations

    Returns:
        A dict mapping impact ('direct', 'motif-changing', 'proximal',
        'distal') to a defaultdict of gene name -> summed value, or an
        empty dict if there are no known motifs for the site type.
    """

    try:
        motifs_counter = MotifsCounter(site_type, mode='change_of_motif')
    except NoKnownMotifs as error:
        warn(f'Impacts collection failed, due to: {error}')
        return {}

    sites = (Site.query.filter(SiteType.fuzzy_filter(
        site_type,
        join=True)).join(Protein).filter(Protein.is_preferred_isoform))

    def fuzzy_site_filter(sites):
        return [
            site for site in sites
            # matches 'O-glycosylation' for site_type 'glycosylation'
            if any(
                site_type.name in type_name for type_name in site.types_names)
        ]

    mutations_by_impact_by_gene = {
        # order matters
        'direct': defaultdict(int),
        'motif-changing': defaultdict(int),
        'proximal': defaultdict(int),
        'distal': defaultdict(int)
    }

    mutations = (Mutation.query.filter(
        Mutation.in_sources(source)).join(Protein).join(
            Gene, Gene.preferred_isoform_id == Protein.id))
    if muts_filter is not None:
        mutations = mutations.filter(muts_filter)

    motifs_data = motifs_counter.gather_muts_and_sites(mutations,
                                                       sites,
                                                       occurrences_in=[source])

    # mutations breaking any of the motifs, regardless of the motif name
    all_breaking_muts = set()
    for breaking_muts in motifs_data.muts_breaking_sites_motif.values():
        all_breaking_muts.update(breaking_muts)

    mutations = mutations.filter(
        Mutation.affected_sites.any(SiteType.fuzzy_filter(site_type,
                                                          join=True)))
    if limit_to_genes is not None:
        preferred_isoforms = (db.session.query(
            Protein.id).select_from(Gene).join(Gene.preferred_isoform).filter(
                Gene.name.in_(limit_to_genes)).all())
        # the query yields one-element rows, while in_() expects plain ids
        proteins_ids = [protein_id for (protein_id,) in preferred_isoforms]
        mutations = mutations.filter(Protein.id.in_(proteins_ids))

    mutations = mutations.with_entities(Gene.name, Mutation)

    if limit_to_muts is not False:
        # each row has to identify exactly one protein and one mutation,
        # otherwise one() raises
        muts = {
            Mutation.query.filter_by(position=mut.position,
                                     alt=mut.mut_residue,
                                     protein=Protein.query.filter_by(
                                         refseq=mut.isoform).one()).one():
            int(mut.count)
            for mut in limit_to_muts.itertuples(index=False)
        }

    for gene_name, mutation in tqdm(mutations, total=mutations.count()):

        if limit_to_muts is not False:
            if mutation not in muts:
                continue
            value = muts[mutation]
        else:
            value = mutation.sources_map[
                source.name].get_value() if occurrences else 1

        impact = mutation.impact_on_ptm(fuzzy_site_filter)
        # motif-breaking mutations are reported separately,
        # unless they hit the site directly
        if impact != 'direct' and mutation in all_breaking_muts:
            mutations_by_impact_by_gene['motif-changing'][gene_name] += value
            continue
        assert impact != 'none'
        mutations_by_impact_by_gene[impact][gene_name] += value

    return mutations_by_impact_by_gene