Exemplo n.º 1
0
def poly_aaa_vs_spidex(variants_by_gene):
    """Analyse poly(A) track changing mutations using data from SPIDEX.

    The poly(A) variants selected by ``all_poly_a_variants`` are passed
    through ``spidex_from_list``; the resulting records are divided with
    ``divide_variants_by_poly_aaa``, plotted, and finally handed to the
    KS test (which is told that the division has already been done).
    """
    spidex_records = spidex_from_list(all_poly_a_variants(variants_by_gene))
    print('Unique points', len(spidex_records))

    print('Plotting')
    grouped_records = divide_variants_by_poly_aaa(spidex_records)
    plot_aaa_vs_spidex(grouped_records)

    print('ks test')
    spidex_aaa_ks_test(grouped_records, already_divided=True)
Exemplo n.º 2
0
def get_data_from_ensembl_api(variants):
    """Collect eQTL records for poly(A) variants from the Ensembl REST API.

    Every variant yielded by ``all_poly_a_variants`` (called with
    ``merge_variants_with_multiple_id=False``) is looked up by its
    ``snp_id`` at the ``/eqtl/variant_name/homo_sapiens`` endpoint.  Each
    datum of a successful response is combined with every
    (transcript, alt) entry of the variant's ``poly_aaa`` data.

    The lookup is best-effort: any exception raised while handling one
    variant (HTTP error, undecodable body, missing key, ...) is printed
    and the loop moves on to the next variant.

    Returns:
        list of tuples ``(snp_id, tissue, value, gene, aaa increased,
        aaa decreased, aaa change, chr_name, chr_start, ref, alt, strand,
        transcript ensembl_id, cds_start, cds_end)``.
    """
    import requests

    server = 'http://rest.ensembl.org'
    # server = 'http://grch37.rest.ensembl.org/' GRCH 37 has no eqtls implemented
    headers = {'content-type': 'application/json'}
    api_report = []

    for variant in all_poly_a_variants(variants,
                                       merge_variants_with_multiple_id=False):

        ext = '/eqtl/variant_name/homo_sapiens/%s?statistic=p-value;content-type=application/json' % variant.snp_id

        try:
            r = requests.get(server + ext, headers=headers)

            # Raises for every non-OK response (and only for those), so no
            # separate `r.ok` check or explicit exit is needed afterwards.
            r.raise_for_status()

            decoded = r.json()

            if 'error' in decoded:
                continue

            print('Got data for %s' % variant.snp_id)
            for datum in decoded:
                for transcript in variant.affected_transcripts:
                    for alt, aaa_data in transcript.poly_aaa.items():
                        api_report.append(
                            (variant.snp_id, datum['tissue'],
                             datum['value'], datum['gene'],
                             aaa_data.increased, aaa_data.decreased,
                             aaa_data.change, variant.chr_name,
                             variant.chr_start, variant.ref, alt,
                             transcript.strand, transcript.ensembl_id,
                             transcript.cds_start, transcript.cds_end))

        except Exception as e:
            # Deliberately broad: one failing variant must not abort the
            # whole collection run.
            print(e)

    return api_report
Exemplo n.º 3
0
def _classify_expression_trend(up_count, down_count):
    """Name the dominant expression direction from per-direction counts.

    Returns 'constant' when the counts are equal.  Otherwise returns 'up'
    or 'down' when the minority direction is absent or the integer ratio
    majority // minority exceeds 10, and 'rather_up' / 'rather_down'
    when the majority is less pronounced.
    """
    if up_count == down_count:
        return 'constant'

    if up_count > down_count:
        direction, majority, minority = 'up', up_count, down_count
    else:
        direction, majority, minority = 'down', down_count, up_count

    if minority == 0 or majority // minority > 10:
        return direction
    return 'rather_' + direction


def poly_aaa_vs_expression(variants_by_gene):
    """Relate poly(A) changes of variants to expression slopes from GTEx.

    For every poly(A) variant, every affected transcript with ``poly_aaa``
    data and every alternative allele that has expression data in the
    ``ExpressionDatabase``, the function:

    * adds one row per (tissue, slope, gene) record to the per-tissue
      report,
    * counts the records with positive and negative slope and derives a
      trend label from those counts,
    * stores the label in ``transcript.expression[alt]`` (the dict is
      reset for each processed transcript),
    * adds one summary row to the per-variant report.

    Both tables are written with ``report`` and the per-tissue rows are
    passed to ``summarize_tissue_eqtl_aaa_correlation``.
    """
    bdb = ExpressionDatabase(GTEX_DATABASE)

    gtex_report = []
    gtex_report_with_tissue = []

    aaa_variants_list = list(all_poly_a_variants(variants_by_gene))

    print('Analysing %s poly_a related variants (out of %s total).' %
          (len(aaa_variants_list), len(variants_by_gene)))

    for variant in aaa_variants_list:

        for transcript in variant.affected_transcripts:

            if not transcript.poly_aaa:
                continue

            expression_data_by_alt = bdb.get_by_mutation(variant, transcript)

            transcript.expression = {}

            for alt, aaa_data in transcript.poly_aaa.items():

                expression_data = expression_data_by_alt.get(alt)

                if not expression_data:
                    continue

                print('Expression data for', variant.snp_id, 'found:',
                      expression_data)

                # Trailing columns shared by the rows of both reports.
                shared_columns = (
                    aaa_data.increased, aaa_data.decreased, aaa_data.change,
                    variant.chr_name, variant.chr_start, variant.ref, alt,
                    transcript.strand, transcript.ensembl_id,
                    transcript.cds_start, transcript.cds_end)

                expression_up_in_x_cases = 0
                expression_down_in_x_cases = 0

                for tissue_name, slope, gene in expression_data:
                    # The slope is reported exactly as stored; it is only
                    # converted to float for the sign test below.
                    gtex_report_with_tissue.append(
                        (variant.snp_id, tissue_name, slope, gene) +
                        shared_columns)
                    slope = float(slope)
                    if slope > 0:
                        expression_up_in_x_cases += 1
                    elif slope < 0:
                        expression_down_in_x_cases += 1

                expression_trend = _classify_expression_trend(
                    expression_up_in_x_cases, expression_down_in_x_cases)

                transcript.expression[alt] = expression_trend

                gtex_report.append(
                    (variant.snp_id, expression_up_in_x_cases,
                     expression_down_in_x_cases, expression_trend) +
                    shared_columns)

    report('expression table for variants (based on data from gtex)',
           ['\t'.join(map(str, line)) for line in gtex_report], [
               'variant', 'expression+', 'expression-', 'trend', 'aaa+',
               'aaa-', 'aaa_change', 'chrom', 'pos', 'ref', 'alt', 'strand',
               'transcript', 'cds_start', 'cds_end'
           ])

    report(
        'expression table for variants with tissues (based on data from gtex)',
        ['\t'.join(map(str, line)) for line in gtex_report_with_tissue], [
            'variant', 'tissue', 'slope', 'gene', 'aaa+', 'aaa-', 'aaa_change',
            'chrom', 'pos', 'ref', 'alt', 'strand', 'transcript', 'cds_start',
            'cds_end'
        ])

    summarize_tissue_eqtl_aaa_correlation(gtex_report_with_tissue)

    # TODO: a per-gene expression table (alleles / variants with
    # expression+ and expression-, #aaa+, #aaa-) is not implemented.

    print('Done')
Exemplo n.º 4
0
def spidex_aaa_ks_test(variants_groups, already_divided=False):
    """Run two-sample KS tests on SPIDEX z-scores of poly(A) variant groups.

    Args:
        variants_groups: when ``already_divided`` is true, a mapping of
            group name -> points, where each point supports
            ``point['dpsi_zscore']`` and ``point['new_aaa_length']``.
            Otherwise the value is passed through ``all_poly_a_variants``,
            ``spidex_from_list`` and ``divide_variants_by_poly_aaa`` to
            build that mapping first.
        already_divided: whether ``variants_groups`` is already the
            divided mapping described above.

    Two families of tests are printed:

    1. every pair of groups against each other;
    2. for each observed new poly(A) length ``x``: z-scores of points with
       length <= x against those with length > x.  Thresholds with an
       empty "greater" side are skipped.

    ``-log(p-value)`` of the second family is plotted per length and the
    figure is passed to ``save_plot``.
    """
    if not already_divided:
        aaa_variants_list = all_poly_a_variants(variants_groups)
        raw_report = spidex_from_list(aaa_variants_list)
        variants_groups = divide_variants_by_poly_aaa(raw_report)

    groups_zscores = {
        name: [point['dpsi_zscore'] for point in group]
        for name, group in variants_groups.items()
    }

    for group_1, group_2 in combinations(groups_zscores, 2):
        print('%s vs %s:' % (group_1, group_2))
        ks_result = ks_2samp(groups_zscores[group_1], groups_zscores[group_2])
        print(ks_result)

    groups_new_aaa_lengths = defaultdict(list)

    for group in variants_groups.values():
        for point in group:
            groups_new_aaa_lengths[point['new_aaa_length']].append(
                point['dpsi_zscore'])

    ks_results = {}

    for new_aaa_length in sorted(groups_new_aaa_lengths):
        print(
            'All mutations causing poly_aaa to be <= %s vs all mutations causing poly_aaa to be > %s:'
            % (new_aaa_length, new_aaa_length))
        z_scores_1 = [
            zscore
            for length, zscores in groups_new_aaa_lengths.items()
            if length <= new_aaa_length
            for zscore in zscores
        ]
        z_scores_2 = [
            zscore
            for length, zscores in groups_new_aaa_lengths.items()
            if length > new_aaa_length
            for zscore in zscores
        ]
        if not z_scores_2:
            print('No mutations causing poly_aaa to be > %s' % new_aaa_length)
            continue
        ks_result = ks_2samp(z_scores_1, z_scores_2)
        print(new_aaa_length, ks_result)
        ks_results[new_aaa_length] = -np.log(ks_result.pvalue)

    # Keys were inserted while walking the sorted lengths, so this list is
    # ascending.
    lengths = list(ks_results)

    if lengths:
        # Histogram bins are half-open except the last one, which also
        # contains its right edge.  Using the lengths alone as edges would
        # therefore merge the two largest lengths into a single bar (and a
        # single length would give no bin at all), so one extra right edge
        # is appended to give every length its own bin.
        # Assumes lengths are numeric - TODO confirm against the data.
        bin_edges = lengths + [lengths[-1] + 1]
        plt.hist(lengths,
                 weights=list(ks_results.values()),
                 bins=bin_edges,
                 rwidth=0.9)
    plt.xticks(lengths)

    plt.xlabel('Length of poly(A) track: $x$')
    plt.ylabel(r'$-\log($P-Value$)$')
    plt.title(
        'Ks-test for groups: '
        r'mutations effecting in poly(A) length $\leq$ $x$ vs mutations effecting in poly(A) length > $x$'
    )
    plt.grid(True)
    save_plot(plt)
Exemplo n.º 5
0
def summarize_poly_aaa_variants(variants):
    """Report and print summary statistics of poly(A) related variants.

    Walks over the variants yielded by
    ``all_poly_a_variants(variants, preserve_sources=True)`` and builds:

    * one record per (variant, transcript, alt) with the poly(A)
      increase / decrease / change and the coordinates,
    * a count of variants per source (``variant.source`` is a
      comma-separated list; every distinct source counts once per variant),
    * the flat list of all identifiers found in the comma-separated
      ``variant.snp_id``.

    These are written with ``report``; the aggregated counters are printed.
    """
    columns = [
        'snp_id', 'gene', 'poly_aaa_increase', 'poly_aaa_decrease',
        'poly_aaa_change', 'chr', 'start', 'end', 'ref', 'alt', 'transcript',
        'cds_start', 'cds_end'
    ]
    Record = recordclass('RecordPolyA', columns)

    aaa_records = []
    all_variants_ids = []
    variants_sources = Counter()

    aaa_variants = set()
    increasing_variants = set()
    decreasing_variants = set()
    transcript_ids = set()

    new_poly_a = 0
    in_poly_a = 0

    for variant in all_poly_a_variants(variants, preserve_sources=True):

        all_variants_ids += variant.snp_id.split(',')

        creates_track = False
        inside_track = False

        for transcript in variant.affected_transcripts:

            if transcript.poly_aaa:

                for alt, aaa_data in transcript.poly_aaa.items():

                    if aaa_data.has:
                        inside_track = True
                    elif aaa_data.will_have:
                        creates_track = True

                    if aaa_data.increased:
                        increasing_variants.add(variant)
                    if aaa_data.decreased:
                        decreasing_variants.add(variant)

                    transcript_ids.add(transcript.ensembl_id)

                    aaa_records.append(
                        Record(
                            variant.snp_id,
                            None,  # variant.ensembl_gene_stable_id    # TODO
                            aaa_data.increased,
                            aaa_data.decreased,
                            aaa_data.change,
                            variant.chr_name,
                            variant.chr_start,
                            variant.chr_end,
                            variant.ref,
                            alt,
                            transcript.ensembl_id,
                            transcript.cds_start,
                            transcript.cds_end))

                # Only variants with at least one poly_aaa-carrying
                # transcript are counted as unique poly(A) variants.
                aaa_variants.add(variant)

        new_poly_a += int(creates_track)
        in_poly_a += int(inside_track)

        # De-duplicate so that a source listed twice counts once per variant.
        variants_sources.update(set(variant.source.split(',')))

    report('poly aaa increase and decrease by variants', aaa_records, columns)
    report('poly aaa sources', variants_sources.items(), ['source', 'count'])

    report('all ids', all_variants_ids)

    identifiers_count = sum(v.snp_id.count(',') + 1 for v in aaa_variants)

    print('Variants creating new poly(A) tracks: %s' % new_poly_a)
    print('Variants in existing poly(A) tracks: %s' % in_poly_a)
    print('Affected transcripts: %s' % len(transcript_ids))
    print('Down variants: %s' % len(decreasing_variants))
    print('Up variants: %s' % len(increasing_variants))
    print('Unique variants: %s' % len(aaa_variants))
    print('Variants identifiers: %s' % identifiers_count)
    print(variants_sources)
Exemplo n.º 6
0
def poly_aaa_consequences(variants):
    """Summarise predicted consequences of poly(A) changing mutations.

    Steps, in order:

    1. Every (transcript, alt) poly_aaa entry of every variant is put in
       the category 'increased', 'decreased' or 'constant' and rendered
       with ``transcript.as_hgvs``; the share of indels (HGVS strings
       containing 'del' or 'ins') per category is printed.
    2. The HGVS lists (per category and 'all') are written with ``report``
       and the report file is run through ``vep``.  From the tab-separated
       output, rows whose 6th column is not 'Transcript' or whose 5th
       column differs from the tested transcript are skipped; the
       comma-separated consequences of the 7th column are counted.
    3. The counts are propagated over the sequence ontology graph and one
       coloured DAG per category is drawn to
       ``reports/poly_a_consequences_dag_<category>.svg``.
    4. Pie charts of selected consequence groups are drawn per category
       and passed to ``save_plot``.
    """
    mutations_in_cds_hgvs_format = defaultdict(list)
    indels = Counter()
    totals = Counter()

    for variant in all_poly_a_variants(variants, preserve_sources=True):

        for transcript in variant.affected_transcripts:

            if not transcript.poly_aaa:
                continue

            for alt, aaa_data in transcript.poly_aaa.items():

                if aaa_data.increased:
                    category = 'increased'
                elif aaa_data.decreased:
                    category = 'decreased'
                else:
                    category = 'constant'

                hgvs = transcript.as_hgvs(variant.ref, alt)
                if 'del' in hgvs or 'ins' in hgvs:
                    indels[category] += 1
                totals[category] += 1

                mutations_in_cds_hgvs_format[category].append(hgvs)
                mutations_in_cds_hgvs_format['all'].append(hgvs)

    print('Indels enrichment:')
    for category in indels:
        # A category present in `indels` was also counted in `totals`, so
        # the denominator is never zero here.
        print(category, indels[category] / totals[category] * 100, '%')

    # NOTE(review): the same reports are generated again in the loop below;
    # this first pass looks redundant - confirm against `report` before
    # removing it.
    for category, muts in mutations_in_cds_hgvs_format.items():
        report(
            'Mutations which result in ' + category + ' in cds hgvs formats',
            muts)

    consequences = defaultdict(Counter)
    skipped = Counter()
    for category, muts in mutations_in_cds_hgvs_format.items():
        filename = report(
            'Mutations which result in ' + category + ' in cds hgvs formats',
            muts)
        vep_filename = vep(filename)
        # The context manager closes the VEP output even if parsing fails.
        with open(vep_filename) as vep_file:
            for line in vep_file:
                if line.startswith('#'):
                    continue
                line = line.split('\t')
                tested_transcript = line[0].split(':')[0]
                vep_transcript = line[4]
                if line[5] != 'Transcript':
                    skipped['Not a transcript feature'] += 1
                    continue
                if tested_transcript != vep_transcript:
                    skipped['Different transcript'] += 1
                    continue

                variant_consequences = line[6].split(',')
                for consequence in variant_consequences:
                    consequences[category][consequence] += 1

        print(skipped)
        print('Raw consequences')
        print(consequences)

    graph = load_sequence_ontology()
    expanded_consequences = propagate_consequences(graph, consequences)

    for category, counts in expanded_consequences.items():

        consequences_to_include = ['coding_sequence_variant']
        consequences_to_include.extend(counts.keys())
        g = graph.subgraph([
            node for node, data in graph.nodes(data=True)
            if data['name'] in consequences_to_include
        ])
        g = g.reverse()

        max_count = max(counts.values())

        # `nodes(data=True)` yields the live attribute dict of every node,
        # so styling is written through `data`.  This avoids `nodes_iter`
        # and `g.node[...]`, which newer NetworkX releases no longer have.
        for node, data in g.nodes(data=True):
            name = data['name']
            count = counts[name]
            color = (255 - int(log((count / max_count) + 1) * 255), 255, 255)
            data['style'] = 'filled'
            data['shape'] = 'box'
            data['fillcolor'] = '#%02x%02x%02x' % color
            # Dashed border: the consequence is absent from the raw counts
            # of this category (it only appears after propagation).
            if name not in consequences[category]:
                data['style'] = 'dashed,filled'

        g = nx.relabel_nodes(
            g, {
                node: data['name'].replace('variant', 'v.') +
                ': %s' % counts.get(data['name'])
                for node, data in g.nodes(data=True)
            })

        a = nx_agraph.to_agraph(g)

        a.layout(
            'dot',
            args=
            '-Nfontsize=14 -Nwidth=".2" -Nheight=".2" -Nmargin=.1 -Gfontsize=8 -Earrowsize=.5'
        )
        a.draw('reports/poly_a_consequences_dag_' + category + '.svg')

    selected_consequences_groups = {
        'General coding':
        ['synonymous_variant', 'frameshift_variant', 'inframe_variant'],
        'Inframe': [
            'inframe_deletion', 'inframe_insertion', 'missense_variant',
            'stop_gained', 'stop_lost'
        ]
    }

    for group, selected_consequences in selected_consequences_groups.items():
        for category, counts in expanded_consequences.items():
            data = {
                consequence: counts[consequence]
                for consequence in selected_consequences
            }
            # Smallest slice first.
            data = OrderedDict(sorted(data.items(), key=itemgetter(1)))

            # Create a pie chart
            wedges = plt.pie(
                list(data.values()),
                labels=list(data.keys()),
                shadow=False,
                colors=plt.cm.tab20(
                    numpy.linspace(1, 0, len(selected_consequences))),
                startangle=0,
                autopct='%1.1f%%',
            )
            # First element of the returned tuple holds the wedge patches.
            for pie_wedge in wedges[0]:
                pie_wedge.set_edgecolor('black')

            # Equal aspect ratio so the pie is drawn as a circle.
            plt.axis('equal')

            plt.title(group + ' consequences for variants causing ' +
                      category + ' in poly(A) length')
            plt.tight_layout()
            save_plot(plt, hide_title=True)