# In[30]:

pca = sklearn.decomposition.PCA(n_components=10, svd_solver='full')

# Keep only the rows of `data` whose `genes.Platform_VarFraction` is <= 0.2 and
# percentile-transform them once. The same matrix is used for both the fit and
# the projection, so it is built a single time instead of being recomputed.
# NOTE(review): assumes functions.transform_to_percentile is deterministic and
# has no side effects - confirm against its definition.
pca_input = functions.transform_to_percentile(
    data.loc[genes.Platform_VarFraction.values <= 0.2]).transpose()

pca.fit(pca_input)
pca_coords = pca.transform(pca_input)

# Plot the pca

# In[33]:

functions.plot_pca(
    pca_coords,
    annotations,
    pca,
    labels=['celltype', 'Platform_Category', 'Dataset'],
    colour_dict=blood_atlas_colours)

# In[7]:

functions.plot_gene_platform_dependence_distribution(data, annotations, genes)

# Make a graph of the threshold lowering process using the Kruskal Wallis H Test

# In[ ]:

functions.plot_KW_Htest(data, annotations, genes)
# Rows of `data` selected by the Platform_VarFraction threshold (<= 1.0). The
# selection is kept in a variable because the loadings export further down has
# to be labelled with exactly the rows that went into the fit.
selected_rows = data.loc[genes.Platform_VarFraction.values <= 1.0]

# Percentile-transform once and reuse the result for both fit and projection.
# NOTE(review): assumes functions.transform_to_percentile is deterministic and
# has no side effects - confirm against its definition.
pca_input = functions.transform_to_percentile(selected_rows).transpose()
pca.fit(pca_input)
pca_coords = pca.transform(pca_input)

# In[7]:

annotations['display_metadata'] = annotations.index
functions.plot_pca(
    pca_coords,
    annotations,
    pca,
    labels=[
        'Cell Type', 'Sample Source', 'Platform_Category', 'Dataset',
        'Tissue Type'
    ],
    colour_dict=colour_dict,
)  #, out_file='/Users/pwangel/Downloads/dc_atlas.html')

# In[12]:

# Write the loadings of the third principal component (components_[2]), one
# value per fitted row. The index is taken from the filtered selection rather
# than from `data.index`, so the lengths still agree if the threshold ever
# drops a row.
pd.DataFrame(
    index=selected_rows.index,
    data=pca.components_[2],
    columns=['Loading'],
).to_csv("/Users/pwangel/Downloads/mystery_genes.tsv", sep='\t')

# In[ ]:

pca = sklearn.decomposition.PCA(n_components=10, svd_solver='full')
# Gene id -> symbol table; the first file column becomes the index and the
# remaining column is named 'symbol'.
genes = pd.read_csv(
    '/Users/pwangel/Data/ensembl_hg38.91/gene_to_symbol_ensembl91_human.tsv',
    sep='\t', index_col=0, names=['symbol'])

# Symbols that are present both in the expression data (via `genes`) and in
# the index of `genes_df`.
gene_list = np.intersect1d(
    genes.loc[np.intersect1d(data.index, genes.index)].symbol.values,
    genes_df.index.values)

# The chip id is the part of the annotation index before the first ';'.
annotations['chip_id'] = [
    i.split(';')[0] for i in annotations.index.values.astype(str)
]
annotations = annotations.loc[
    (annotations.Platform_Category == 'RNASeq')
    & (annotations.Dataset != '7275.0')
]  # This dataset is mistakenly in, it is annotated endoderm

# Restrict the expression columns to the retained samples, then
# percentile-transform.
data = data[annotations.chip_id]
data = functions.transform_to_percentile(data)

# Run pca
pca = sklearn.decomposition.PCA(n_components=10, svd_solver='full')

# Fit and project the same matrix. `data` has already been
# percentile-transformed above, so it is not transformed a second time here;
# fitting on one matrix and projecting another would give coordinates that do
# not correspond to the fitted components.
pca_input = data.transpose()
pca.fit(pca_input)
pca_coords = pca.transform(pca_input)

functions.plot_pca(
    pca_coords, annotations, pca,
    labels=['generic_sample_type', 'Platform_Category', 'Dataset'],
    colour_dict={},
    pcs=[1, 2, 3],
    out_file='/Users/pwangel/PlotlyWorkspace/combine_data/naive_stemcells/RNASeq_only_pluripotent.html')

#### Apply k means clustering to divide genes on/off state
kmeans = sklearn.cluster.KMeans(n_clusters=2)
#data_output = pd.DataFrame(index=gene_list, columns=['Bimodal val', 'Low_Expr', 'High_Expr', 'Low_Std', 'High_Std'])

# NOTE(review): the loop body appears to continue beyond this view
# (delta_mean / std_sum are not consumed here), so the visible statements are
# kept as they are.
for i_gene in gene_list:
    ensembl_id = genes_df.loc[genes_df.index.values==i_gene].Ensembl

    # One column of expression values for this gene, shaped (n_values, 1) as
    # required by KMeans; built once and used for both fit and predict.
    gene_values = data.loc[ensembl_id].values.reshape(-1, 1)
    kmeans.fit(gene_values)
    prediction = kmeans.predict(gene_values)

    # Difference of the two cluster means and sum of the two cluster standard
    # deviations for this gene.
    delta_mean = data.loc[ensembl_id,prediction==0].values.mean()-data.loc[ensembl_id,prediction==1].values.mean()
    std_sum = data.loc[ensembl_id,prediction==0].values.std()+data.loc[ensembl_id,prediction==1].values.std()
# Save the PCA coordinates, one row per annotated sample and one column per
# component (PCA1 .. PCA10).
component_names = [f'PCA{number}' for number in range(1, 11)]
coordinates_table = pd.DataFrame(
    data=pca_coords, index=annotations.index, columns=component_names)
coordinates_table.to_csv(
    '/Users/pwangel/Downloads/blood_atlas_coordinates_v2.tsv', sep='\t')

# Plot the pca

# In[30]:

annotations['display_metadata'] = annotations.index
plot_labels = ['Cell Type', 'Sample Source', 'Progenitor Type', 'Platform_Category']
functions.plot_pca(
    pca_coords,
    annotations,
    pca,
    labels=plot_labels,
    colour_dict=blood_atlas_colours,
    out_file='/Users/pwangel/Downloads/blood_atlas_with_ext_dc.html')

# In[31]:

# Build a {sample source: colour} mapping from the colour table, taking the
# first remaining column of each row as the colour value.
colour_table = pd.read_csv(
    '/Users/pwangel/Data/Metadata_dumps/imac_atlas_colours.tsv', sep='\t')
colour_table = colour_table.set_index('Sample Source')

myeloid_atlas_colours = {}
for sample_source, row in zip(colour_table.index.values, colour_table.values):
    myeloid_atlas_colours[sample_source] = row[0]
pca = sklearn.decomposition.PCA(n_components=10, svd_solver='full')

# Keep only the rows of `data` whose `genes.Platform_VarFraction` is <= 0.2 and
# percentile-transform them once. The same matrix is used for both the fit and
# the projection, so it is built a single time instead of being recomputed.
# NOTE(review): assumes functions.transform_to_percentile is deterministic and
# has no side effects - confirm against its definition.
pca_input = functions.transform_to_percentile(
    data.loc[genes.Platform_VarFraction.values <= 0.2]).transpose()

pca.fit(pca_input)
pca_coords = pca.transform(pca_input)

# Plot the pca

# In[21]:

functions.plot_pca(
    pca_coords,
    annotations,
    pca,
    labels=['celltype', 'Platform_Category', 'Dataset'],
    colour_dict=blood_atlas_colours,
    out_file='/Users/pwangel/Downloads/myeloid_atlas_ext_dc.html')

# In[7]:

functions.plot_gene_platform_dependence_distribution(data, annotations, genes)

# Make a graph of the threshold lowering process using the Kruskal Wallis H Test

# In[ ]:

functions.plot_KW_Htest(data, annotations, genes)
i_sample].mean() # I generate a separate set of coordinates for the external data as I would like to project them by themselves. The way this works in that a list of data/annotations dataframes is passed to the plot function. The first set of data/annotations is the base and subsequent sets are projected on. The plot the pca is saved as a .html in the <out_file> location. # In[44]: pca_coords_ext = pca.transform( functions.transform_to_percentile( data.loc[genes.Platform_VarFraction.values <= 0.25][ ext_annotations.index]).transpose()) #First dataframes in the list of the base coordinates, following dataframes are projected on functions.plot_pca( [pca_coords, pca_coords_ext], [annotations, ext_annotations], pca, labels=['generic_sample_type', 'Platform_Category', 'Dataset'], colour_dict={}, pcs=[1, 2, 3], out_file='/Users/pwangel/Downloads/pluripotent_atlas_with_external.html') # Now try to 'zoom in' on the pluripotent cells (isolate them by applying k means clustering). This is a fairly rough way to identify the samples that are relevant to the 'naive' vs 'primed' analysis. I want stem cells only, no differentiated samples, this is best cleared up by the biological annotations but k means will do for now. # In[40]: kmeans = sklearn.cluster.KMeans(n_clusters=4).fit(pca_coords) annotations['K Means'] = kmeans.labels_ ext_annotations['K Means'] = annotations['K Means'].loc[ext_annotations.index] # Plot the PCA again but now with the kmeans clusters, so we can identify the biology of each cluster. # In[48]:
# In[122]:

pca = sklearn.decomposition.PCA(n_components=10, svd_solver='full')

# Keep only the rows of `data` whose `genes.Platform_VarFraction` is <= 0.2 and
# percentile-transform them once. The same matrix is used for both the fit and
# the projection, so it is built a single time instead of being recomputed.
# NOTE(review): assumes functions.transform_to_percentile is deterministic and
# has no side effects - confirm against its definition.
pca_input = functions.transform_to_percentile(
    data.loc[genes.Platform_VarFraction.values <= 0.2]).transpose()

pca.fit(pca_input)
pca_coords = pca.transform(pca_input)

# In[123]:

functions.plot_pca(
    pca_coords,
    annotations,
    pca,
    labels=['cell_type', 'Dataset'] + list(nadias_annotations.keys()),
    colour_dict=blood_atlas_colours)

# This section is showing microglia only

# In[124]:

# For every sample that is not microglia, overwrite the annotation columns at
# positions 31..37 with 'Unannotated'. The row mask does not change between
# iterations, so it is computed once.
# NOTE(review): the positional slice 31:38 depends on the current column order
# of `annotations` - confirm which columns it is meant to cover.
not_microglia = annotations.cell_type != 'microglia'
for i_col in annotations.columns[31:38]:
    annotations.loc[not_microglia, i_col] = 'Unannotated'

# NOTE: the call in the next cell is cut off by the end of this view; it is
# left exactly as found.
# In[125]: functions.plot_pca(pca_coords, annotations,