Exemplo n.º 1
0
    # Apply the counts table's log() transform in place.
    ds.counts.log(inplace=True)

    # Only select cells without virus
    # (keeps samples whose 'virus_reads_per_million' metadata is below 0.1)
    ds.query_samples_by_metadata('virus_reads_per_million < 0.1', inplace=True)

    # Check table with number of cells
    # Groups the samplesheet by (time, MOI), takes the per-group count from the
    # first remaining column, pivots MOI into columns, fills missing
    # combinations with 0 and orders the rows by the four time points.
    # NOTE(review): `table` is not printed or used in the visible code; the
    # time labels are strings, presumably matching the samplesheet dtype.
    table = (ds.samplesheet.groupby(
        ['time', 'MOI']).count().iloc[:,
                                      0].unstack().fillna(0).astype(int).loc[[
                                          '4', '12', '24', '48'
                                      ]])

    print('Selecting only early 2 time points')
    # The rest has too few uninfected cells
    ds.query_samples_by_metadata('time in ["4", "12"]', inplace=True)
    # Split by MOI and compare the MOI '0' subset against the MOI '1' subset,
    # keeping only the 'P-value' column of the comparison result.
    # NOTE(review): the name `ks` suggests a Kolmogorov-Smirnov test, but the
    # statistic used by compare() is not visible here -- confirm.
    dsm = ds.split('MOI')
    ks = dsm['0'].compare(dsm['1'])['P-value']

    # Get the top hits for GO analysis
    # (gene names of the 100 features with the smallest P-values, written one
    # per line)
    hits = ds.featuresheet.loc[ks.nsmallest(100).index, 'GeneName'].values
    with open('../tables/bystander_top_100.tsv', 'w') as f:
        f.write('\n'.join(hits))

    # NOTE(review): sys.exit() raises SystemExit, so none of the statements
    # below run while this call is in place. It looks like a leftover early
    # exit -- confirm whether it is intentional.
    sys.exit()

    # Bonferroni correction
    # (multiply each P-value by the number of tests and cap the result at 1)
    ks = np.minimum(1, ks * len(ks))

    # Print cumulative histogram of P-values
    # x holds the corrected P-values in ascending order; y falls linearly from
    # 1 to 0 over the same number of points.
    x = ks.sort_values().values
    y = 1.0 - np.linspace(0, 1, len(x))
Exemplo n.º 2
0
    # Find the two clouds
    # Load a second copy of the dataset, restrict it to the samples already in
    # `ds`, normalize the counts, and index the features by gene name (the
    # original index is kept because drop=False).
    dso = Dataset(
            counts_table='dengue',
            samplesheet='dengue',
            featuresheet='humanGC38',
            )
    dso.query_samples_by_name(ds.samplenames, inplace=True)
    dso.counts.normalize(inplace=True)
    dso.feature_selection.unique(inplace=True)
    dso.reindex(axis='features', column='GeneName', inplace=True, drop=False)
    dso.samplesheet['virus_reads_per_million'] = ds.samplesheet['virus_reads_per_million']

    # Flag samples by the value of `afs` at position 2424. Missing values are
    # filled with -1, so they land in the "< 0.1" group.
    # NOTE(review): `afs` presumably holds allele frequencies and is assumed to
    # be ordered like dso.samplesheet -- confirm against the caller.
    # `ind_not2424` is not used in the visible code; it is kept in case later
    # code relies on it.
    ind_2424 = afs.sel(sample=samplenames, position=2424).fillna(-1).data >= 0.1
    ind_not2424 = afs.sel(sample=samplenames, position=2424).fillna(-1).data < 0.1
    dso.samplesheet['is_2424'] = ind_2424
    dsp = dso.split('is_2424')
    fig, ax = plt.subplots(figsize=(3.8, 3.2))
    colors = {True: 'steelblue', False: 'darkred'}
    for key, dsi in dsp.items():
        color = colors[key]
        # Use truthiness rather than `key is True`: if the split keys are
        # numpy.bool_ values, the identity test is always False and both
        # groups would be labelled 'M1'. Plain Python bools behave as before.
        if key:
            label = 'M2'
        else:
            label = 'M1'
        # The 0.1 pseudocount keeps log10 finite for zero counts.
        y = np.log10(0.1 + dsi.counts.loc['DDIT3'].values)
        # Gaussian jitter (sigma 0.1) is added to the virus abundance before
        # the log.
        # NOTE(review): where the abundance is 0 and the jitter falls below
        # -0.1, the log10 argument is negative and the point becomes NaN.
        x = np.log10(0.1 + np.random.normal(0, 0.1, size=len(y)) + dsi.samplesheet['virus_reads_per_million'].values)
        ax.scatter(x, y, s=10, color=color, alpha=0.15, label=label)
    ax.grid(True)
    ax.legend(loc='upper left', title='Mutant:')
    # Ticks are placed in log10 units on both axes.
    ax.set_xticks([-1, 1, 3, 5])
    ax.set_yticks([-1, 1, 3, 5])
Exemplo n.º 3
0
    # Index features by their 'GeneName', apply the dataset's unique()
    # feature selection and normalize the counts, all in place.
    ds.rename(axis='features', column='GeneName', inplace=True)
    ds.feature_selection.unique(inplace=True)
    ds.counts.normalize(inplace=True)

    ## Restrict to high variance SNVs
    #dsv = ds.copy()
    #ind = ds.counts.values.var(axis=1).argsort()[-200:]
    #dsv.counts = dsv.counts.iloc[ind]

    # Find upregulated genes
    # For every SNV cluster, compare the cells inside the cluster against all
    # other cells and keep the 5 genes with the smallest P-values among those
    # whose mean count is higher inside the cluster.
    clusters = np.unique(ds.samplesheet['clusterN_SNV'])
    genes = {}
    for ic in clusters:
        # Boolean membership column for this cluster, used as the split key.
        ds.samplesheet['clusterN_SNV_{:}'.format(
            ic)] = ds.samplesheet['clusterN_SNV'] == ic
        dss = ds.split('clusterN_SNV_{:}'.format(ic))
        comp = dss[True].compare(dss[False])

        # FIXME: maybe look symmetrically for up- and downregulated
        # 'diff' is mean(in cluster) - mean(out of cluster) per feature; only
        # positive differences are considered below.
        comp['diff'] = dss[True].counts.mean(axis=1) - dss[False].counts.mean(
            axis=1)
        genesi = comp.loc[comp['diff'] > 0,
                          'P-value'].nsmallest(n=5).index.values

        genes[ic] = genesi

    # Union of the per-cluster gene lists (sorted and de-duplicated by
    # np.unique), written as a single tab-separated line.
    genes_all = np.unique(np.concatenate(list(genes.values())))
    with open('../data/genes_diff_expressed_clustersSNV.tsv', 'wt') as f:
        f.write('\t'.join(genes_all))

    # Restrict to the selected genes; called without inplace=True, so the
    # result is presumably a new dataset and `ds` is left untouched -- confirm.
    dsv = ds.query_features_by_name(genes_all)
Exemplo n.º 4
0
                            sharex=True,
                            sharey=True,
                            figsize=(8, 4))
    # Scatter the samples using the coordinates in `vs`, once colored by the
    # 'dbscan' column and once by the 'kmeans' column, on the two axes.
    dsr.plot.scatter_reduced_samples(vs,
                                     color_by='dbscan',
                                     ax=axs[0],
                                     zorder=10)
    dsr.plot.scatter_reduced_samples(vs,
                                     color_by='kmeans',
                                     ax=axs[1],
                                     zorder=10)

    # NOTE(review): the "7 clusters" in the title is hard-coded; the actual
    # number of clusters is set outside the visible code.
    axs[0].set_title('DBSCAN')
    axs[1].set_title('K-means, 7 clusters')

    plt.tight_layout()

    # Copy the k-means labels onto the full dataset and split it by cluster.
    ds.samplesheet['cluster'] = dsr.samplesheet['kmeans']
    ds_dict = ds.split(phenotypes=['cluster'])

    # For each cluster, compare its samples against all remaining samples and
    # keep the index labels of the 10 features with the smallest P-values.
    genes_by_cluster = {}
    for key, dsi in ds_dict.items():
        # '@key' in the query is resolved through local_dict, so the loop
        # variable must keep the name `key`.
        dso = ds.query_samples_by_metadata('cluster!=@key',
                                           local_dict=locals())
        genes_by_cluster[key] = dsi.compare(dso)['P-value'].nsmallest(10).index
    # Check on the first three top features of cluster 1.
    # NOTE(review): this depends on the cluster labels being stable between
    # runs, and the assert is stripped under `python -O`.
    assert (genes_by_cluster[1][:3].tolist() == [
        'ENSG00000138085', 'ENSG00000184076', 'ENSG00000116459'
    ])

    plt.show()