# Work on log-transformed counts from here on.
ds.counts.log(inplace=True)

# Keep only the cells with (almost) no viral reads.
ds.query_samples_by_metadata('virus_reads_per_million < 0.1', inplace=True)

# Number of remaining cells per (time, MOI), one row per time point.
time_points = ['4', '12', '24', '48']
cells_per_group = ds.samplesheet.groupby(['time', 'MOI']).count().iloc[:, 0]
table = cells_per_group.unstack().fillna(0).astype(int).loc[time_points]

# Later time points have too few uninfected cells, so drop them.
print('Selecting only early 2 time points')
ds.query_samples_by_metadata('time in ["4", "12"]', inplace=True)

# Compare the MOI '0' cells against the MOI '1' cells, feature by feature.
dsm = ds.split('MOI')
ks = dsm['0'].compare(dsm['1'])['P-value']

# Save the gene names of the 100 smallest P-values for GO analysis.
top_features = ks.nsmallest(100).index
hits = ds.featuresheet.loc[top_features, 'GeneName'].values
with open('../tables/bystander_top_100.tsv', 'w') as handle:
    handle.write('\n'.join(hits))

# NOTE(review): the script stops here, so everything below is currently
# unreachable; remove this call to run the P-value histogram part again.
sys.exit()

# Bonferroni correction, capped at 1.
n_tests = len(ks)
ks = np.minimum(1, ks * n_tests)

# Coordinates for the cumulative histogram of corrected P-values.
x = ks.sort_values().values
y = 1.0 - np.linspace(0, 1, len(x))
# Find the two clouds
# Build a second Dataset restricted to the samples of ``ds``, with
# normalized counts and features indexed by gene name.
dso = Dataset(
    counts_table='dengue',
    samplesheet='dengue',
    featuresheet='humanGC38',
)
dso.query_samples_by_name(ds.samplenames, inplace=True)
dso.counts.normalize(inplace=True)
dso.feature_selection.unique(inplace=True)
dso.reindex(axis='features', column='GeneName', inplace=True, drop=False)
dso.samplesheet['virus_reads_per_million'] = ds.samplesheet['virus_reads_per_million']

# Value at position 2424 for each sample; missing values become -1 so they
# fall below the 0.1 threshold. The selection is evaluated once and reused
# for both masks.
af_2424 = afs.sel(sample=samplenames, position=2424).fillna(-1).data
ind_2424 = af_2424 >= 0.1
ind_not2424 = af_2424 < 0.1
dso.samplesheet['is_2424'] = ind_2424
dsp = dso.split('is_2424')

fig, ax = plt.subplots(figsize=(3.8, 3.2))
colors = {True: 'steelblue', False: 'darkred'}
for key, dsi in dsp.items():
    color = colors[key]
    # Truthiness instead of ``key is True``: a numpy boolean key compares and
    # hashes equal to True but is not the same object, so the identity test
    # would label both groups 'M1'.
    label = 'M2' if key else 'M1'
    y = np.log10(0.1 + dsi.counts.loc['DDIT3'].values)
    # Gaussian jitter (sd 0.1) on the x axis before taking the log.
    # NOTE(review): for samples with virus_reads_per_million close to 0 the
    # jitter can push the argument below zero, giving NaN (point not drawn)
    # -- confirm this is acceptable.
    x = np.log10(0.1 + np.random.normal(0, 0.1, size=len(y)) + dsi.samplesheet['virus_reads_per_million'].values)
    ax.scatter(x, y, s=10, color=color, alpha=0.15, label=label)

ax.grid(True)
ax.legend(loc='upper left', title='Mutant:')
ax.set_xticks([-1, 1, 3, 5])
ax.set_yticks([-1, 1, 3, 5])
# Index features by gene name, keep unique features, normalize the counts.
ds.rename(axis='features', column='GeneName', inplace=True)
ds.feature_selection.unique(inplace=True)
ds.counts.normalize(inplace=True)

## Restrict to high variance SNVs
#dsv = ds.copy()
#ind = ds.counts.values.var(axis=1).argsort()[-200:]
#dsv.counts = dsv.counts.iloc[ind]

# For every SNV cluster, pick the 5 genes with the smallest P-value among
# those whose mean count is higher inside the cluster than outside it.
clusters = np.unique(ds.samplesheet['clusterN_SNV'])
genes = {}
for ic in clusters:
    # Boolean membership column for this cluster, then split on it.
    membership = 'clusterN_SNV_{:}'.format(ic)
    ds.samplesheet[membership] = ds.samplesheet['clusterN_SNV'] == ic
    dss = ds.split(membership)
    inside, outside = dss[True], dss[False]

    comp = inside.compare(outside)
    # FIXME: maybe look symmetrically for up- and downregulated
    comp['diff'] = inside.counts.mean(axis=1) - outside.counts.mean(axis=1)

    pvals_up = comp.loc[comp['diff'] > 0, 'P-value']
    genesi = pvals_up.nsmallest(n=5).index.values
    genes[ic] = genesi

# Union of the per-cluster picks, written out tab-separated.
genes_all = np.unique(np.concatenate(list(genes.values())))
with open('../data/genes_diff_expressed_clustersSNV.tsv', 'wt') as handle:
    handle.write('\t'.join(genes_all))

# Dataset restricted to the selected genes.
dsv = ds.query_features_by_name(genes_all)
sharex=True, sharey=True, figsize=(8, 4))
# NOTE(review): the line above is the tail of a call that opens before this
# chunk (presumably the one creating ``axs``) -- left untouched.

# Draw the reduced samples twice, colored by the 'dbscan' and by the 'kmeans'
# samplesheet columns, one panel each.
dsr.plot.scatter_reduced_samples(vs, color_by='dbscan', ax=axs[0], zorder=10)
dsr.plot.scatter_reduced_samples(vs, color_by='kmeans', ax=axs[1], zorder=10)
axs[0].set_title('DBSCAN')
axs[1].set_title('K-means, 7 clusters')
plt.tight_layout()

# Copy the k-means labels onto the full dataset and split it by cluster.
ds.samplesheet['cluster'] = dsr.samplesheet['kmeans']
ds_dict = ds.split(phenotypes=['cluster'])

# For each cluster, compare it against all other samples and keep the 10
# features with the smallest P-values.
genes_by_cluster = {}
for key, dsi in ds_dict.items():
    # ``@key`` in the query string is resolved through ``local_dict``;
    # presumably pandas-style query syntax -- TODO confirm.
    dso = ds.query_samples_by_metadata('cluster!=@key', local_dict=locals())
    genes_by_cluster[key] = dsi.compare(dso)['P-value'].nsmallest(10).index

# Regression check: the top three features of cluster 1 are expected to be
# exactly these identifiers, in this order.
assert (genes_by_cluster[1][:3].tolist() == [
    'ENSG00000138085', 'ENSG00000184076', 'ENSG00000116459'
])

plt.show()