# In[30]:

# Fit a 10-component PCA on the rows of `data` selected by the
# Platform_VarFraction <= 0.2 mask and project the same matrix.
# The filtered, percentile-transformed matrix is built once so that the fit
# and the projection are guaranteed to use identical input (previously the
# whole expression was written out, and computed, twice).
pca = sklearn.decomposition.PCA(n_components=10, svd_solver='full')

# Boolean mask applied positionally to the rows of `data`;
# assumes `genes` and `data` share the same row order -- TODO confirm.
low_platform_genes = genes.Platform_VarFraction.values <= 0.2

# Transposed so that the original columns of `data` become the rows that
# PCA treats as observations.
percentile_matrix = functions.transform_to_percentile(
    data.loc[low_platform_genes]).transpose()

pca.fit(percentile_matrix)
pca_coords = pca.transform(percentile_matrix)

# Plot the pca

# In[33]:

# Draw the PCA using the fitted model and its coordinates.
# `labels` are presumably annotation column names used to colour the
# samples -- confirm against functions.plot_pca.
functions.plot_pca(
    pca_coords,
    annotations,
    pca,
    labels=['celltype', 'Platform_Category', 'Dataset'],
    colour_dict=blood_atlas_colours,
)

# In[7]:

# Presumably plots the distribution of per-gene platform dependence; the
# helper lives in the project `functions` module -- confirm its behaviour there.
functions.plot_gene_platform_dependence_distribution(data, annotations, genes)

# Make a graph of the threshold lowering process using the Kruskal Wallis H Test

# In[ ]:

# Takes the same three inputs (expression data, sample annotations, gene table)
# as the call above.
functions.plot_KW_Htest(data, annotations, genes)
# --- Exemplo n.º 2 (Example no. 2) ---
# 0
# Fit the existing `pca` object on the rows of `data` selected by the
# Platform_VarFraction <= 1.0 mask and project the same matrix.
# The matrix is built once so the fit and the projection cannot drift apart
# (previously the filter + percentile transform was written and run twice).

# Boolean mask applied positionally to the rows of `data`;
# assumes `genes` and `data` share the same row order -- TODO confirm.
selected_genes = genes.Platform_VarFraction.values <= 1.0

# Transposed so that the original columns of `data` become the rows that
# PCA treats as observations.
percentile_matrix = functions.transform_to_percentile(
    data.loc[selected_genes]).transpose()

pca.fit(percentile_matrix)
pca_coords = pca.transform(percentile_matrix)

# In[7]:

# Expose the annotation index as a regular column named 'display_metadata'.
annotations['display_metadata'] = annotations.index

# Options for the PCA plot. The author left an output file commented out:
#   out_file='/Users/pwangel/Downloads/dc_atlas.html'
plot_options = dict(
    labels=[
        'Cell Type',
        'Sample Source',
        'Platform_Category',
        'Dataset',
        'Tissue Type',
    ],
    colour_dict=colour_dict,
)
functions.plot_pca(pca_coords, annotations, pca, **plot_options)

# In[12]:

# Write the loadings of the third principal component (components_[2]) to a
# tab-separated file, one row per entry of data.index.
# NOTE(review): this requires len(data.index) to equal the number of features
# the PCA was fitted on -- confirm no rows were filtered out before the fit.
third_component_loadings = pd.DataFrame(
    data=pca.components_[2],
    index=data.index,
    columns=['Loading'],
)
third_component_loadings.to_csv(
    "/Users/pwangel/Downloads/mystery_genes.tsv",
    sep='\t',
)

# In[ ]:

pca = sklearn.decomposition.PCA(n_components=10, svd_solver='full')

# Gene table indexed by its first column, with a single 'symbol' column.
genes = pd.read_csv(
    '/Users/pwangel/Data/ensembl_hg38.91/gene_to_symbol_ensembl91_human.tsv',
    sep='\t',
    index_col=0,
    names=['symbol'],
)

# Symbols of genes present in both `data` and the gene table, restricted to
# those that also appear in the index of `genes_df`.
shared_gene_ids = np.intersect1d(data.index, genes.index)
gene_list = np.intersect1d(genes.loc[shared_gene_ids].symbol.values,
                           genes_df.index.values)

# chip_id is the part of each annotation index entry before the first ';'.
annotations['chip_id'] = [
    sample_id.split(';')[0]
    for sample_id in annotations.index.values.astype(str)
]

# Keep RNASeq samples only, and drop dataset '7275.0':
# this dataset is mistakenly in, it is annotated endoderm.
is_rnaseq = annotations.Platform_Category == 'RNASeq'
is_not_endoderm_dataset = annotations.Dataset != '7275.0'
annotations = annotations.loc[is_rnaseq & is_not_endoderm_dataset]

# Select the columns of `data` named by the remaining chip ids, then convert
# the result to percentiles.
data = functions.transform_to_percentile(data[annotations.chip_id])

# Run pca

# Fit a 10-component PCA and project the samples onto it.
# Fix: the model used to be fitted on
# functions.transform_to_percentile(data.transpose()) while the coordinates
# were computed from data.transpose(), i.e. from a different matrix than the
# one the components were learned on. `data` has already been converted to
# percentiles in the preceding step, so the same transposed matrix is now
# used for both the fit and the projection.
pca = sklearn.decomposition.PCA(n_components=10, svd_solver='full')

# Rows of `data` are genes and columns are samples (columns are selected by
# chip id above), so the transpose has one row per sample.
samples_by_genes = data.transpose()
pca.fit(samples_by_genes)
pca_coords = pca.transform(samples_by_genes)

# No colour overrides are supplied (empty colour_dict); principal components
# 1-3 are requested and the figure is written to `out_file`.
functions.plot_pca(
    pca_coords,
    annotations,
    pca,
    labels=['generic_sample_type', 'Platform_Category', 'Dataset'],
    colour_dict={},
    pcs=[1, 2, 3],
    out_file='/Users/pwangel/PlotlyWorkspace/combine_data/naive_stemcells/RNASeq_only_pluripotent.html')

#### Apply k means clustering to divide genes on/off state

# Two-cluster k-means, refitted for every gene in the loop below.
kmeans = sklearn.cluster.KMeans(n_clusters=2)
#data_output = pd.DataFrame(index=gene_list, columns=['Bimodal val', 'Low_Expr', 'High_Expr', 'Low_Std', 'High_Std'])

# NOTE(review): delta_mean and std_sum are overwritten on every iteration and
# never stored in the visible code; the commented-out `data_output` frame
# above suggests they were meant to be collected -- confirm.
# NOTE(review): cluster labels 0/1 are arbitrary, so the sign of delta_mean
# by itself is not meaningful.
for i_gene in gene_list:

    # Rows of genes_df whose index equals this gene; their 'Ensembl' values.
    ensembl_id = genes_df.loc[genes_df.index.values == i_gene].Ensembl

    # Expression values for the gene as a single-feature column vector.
    expression_column = data.loc[ensembl_id].values.reshape(-1, 1)
    kmeans.fit(expression_column)
    prediction = kmeans.predict(expression_column)

    # Values of the samples assigned to each of the two clusters.
    cluster_0_values = data.loc[ensembl_id, prediction == 0].values
    cluster_1_values = data.loc[ensembl_id, prediction == 1].values

    delta_mean = cluster_0_values.mean() - cluster_1_values.mean()
    std_sum = cluster_0_values.std() + cluster_1_values.std()
# Save the PCA coordinates as a tab-separated table: one row per entry of
# annotations.index, columns PCA1 ... PCA10.
component_names = ['PCA{}'.format(number) for number in range(1, 11)]
coordinates_table = pd.DataFrame(
    data=pca_coords,
    index=annotations.index,
    columns=component_names,
)
coordinates_table.to_csv(
    '/Users/pwangel/Downloads/blood_atlas_coordinates_v2.tsv',
    sep='\t',
)

# Plot the pca

# In[30]:

# Expose the annotation index as a regular column named 'display_metadata'.
annotations['display_metadata'] = annotations.index

# Plot options; the figure is written to `out_file`.
plot_options = dict(
    labels=[
        'Cell Type',
        'Sample Source',
        'Progenitor Type',
        'Platform_Category',
    ],
    colour_dict=blood_atlas_colours,
    out_file='/Users/pwangel/Downloads/blood_atlas_with_ext_dc.html',
)
functions.plot_pca(pca_coords, annotations, pca, **plot_options)

# In[31]:

# Read the colour table and index it by its 'Sample Source' column.
colour_table = pd.read_csv(
    '/Users/pwangel/Data/Metadata_dumps/imac_atlas_colours.tsv',
    sep='\t',
).set_index('Sample Source')

# Map each 'Sample Source' value to the entry in the first remaining column.
myeloid_atlas_colours = dict(
    zip(colour_table.index.values, colour_table.values[:, 0]))
# Fit a 10-component PCA on the rows of `data` selected by the
# Platform_VarFraction <= 0.2 mask and project the same matrix.
# The filtered, percentile-transformed matrix is built once so that the fit
# and the projection are guaranteed to use identical input (previously the
# whole expression was written out, and computed, twice).
pca = sklearn.decomposition.PCA(n_components=10, svd_solver='full')

# Boolean mask applied positionally to the rows of `data`;
# assumes `genes` and `data` share the same row order -- TODO confirm.
low_platform_genes = genes.Platform_VarFraction.values <= 0.2

# Transposed so that the original columns of `data` become the rows that
# PCA treats as observations.
percentile_matrix = functions.transform_to_percentile(
    data.loc[low_platform_genes]).transpose()

pca.fit(percentile_matrix)
pca_coords = pca.transform(percentile_matrix)

# Plot the pca

# In[21]:

# Plot the PCA and write the figure to `out_file`.
# NOTE(review): the output is named "myeloid_atlas" but the colours passed are
# `blood_atlas_colours`, not the `myeloid_atlas_colours` built earlier in this
# file -- confirm which palette is intended.
label_columns = ['celltype', 'Platform_Category', 'Dataset']
functions.plot_pca(pca_coords,
                   annotations,
                   pca,
                   labels=label_columns,
                   colour_dict=blood_atlas_colours,
                   out_file='/Users/pwangel/Downloads/myeloid_atlas_ext_dc.html')

# In[7]:

# Presumably plots the distribution of per-gene platform dependence; the
# helper lives in the project `functions` module -- confirm its behaviour there.
functions.plot_gene_platform_dependence_distribution(data, annotations, genes)

# Make a graph of the threshold lowering process using the Kruskal Wallis H Test

# In[ ]:

# Takes the same three inputs (expression data, sample annotations, gene table)
# as the call above.
functions.plot_KW_Htest(data, annotations, genes)
                                          i_sample].mean()

# I generate a separate set of coordinates for the external data as I would like to project them by themselves. The way this works is that a list of data/annotations dataframes is passed to the plot function. The first set of data/annotations is the base and subsequent sets are projected on. The PCA plot is saved as a .html file at the <out_file> location.

# In[44]:

# Project the external samples with the already-fitted `pca`.
# NOTE(review): the gene filter here (<= 0.25) must select exactly the genes
# `pca` was fitted on -- confirm it matches the threshold used for the fit.
low_platform_genes = genes.Platform_VarFraction.values <= 0.25
external_expression = data.loc[low_platform_genes][ext_annotations.index]
pca_coords_ext = pca.transform(
    functions.transform_to_percentile(external_expression).transpose())

# The first entry of each list holds the base coordinates/annotations; the
# entries after it are projected on.
coordinate_sets = [pca_coords, pca_coords_ext]
annotation_sets = [annotations, ext_annotations]
functions.plot_pca(
    coordinate_sets,
    annotation_sets,
    pca,
    labels=['generic_sample_type', 'Platform_Category', 'Dataset'],
    colour_dict={},
    pcs=[1, 2, 3],
    out_file='/Users/pwangel/Downloads/pluripotent_atlas_with_external.html',
)

# Now try to 'zoom in' on the pluripotent cells (isolate them by applying k means clustering). This is a fairly rough way to identify the samples that are relevant to the 'naive' vs 'primed' analysis. I want stem cells only, no differentiated samples, this is best cleared up by the biological annotations but k means will do for now.

# In[40]:

# Split the samples into four clusters in PCA space.
# k-means starts from a random initialisation, so without a fixed seed the
# cluster numbering (and possibly the partition) changes between runs;
# random_state pins it so that any later selection by cluster number is
# reproducible.
kmeans = sklearn.cluster.KMeans(n_clusters=4, random_state=0).fit(pca_coords)

# labels_ is assigned positionally: the rows of pca_coords must be in the
# same order as the rows of `annotations`.
annotations['K Means'] = kmeans.labels_

# Look up the cluster of each external sample by index label; this expects
# the labels in ext_annotations.index to be present in annotations.index.
ext_annotations['K Means'] = annotations['K Means'].loc[ext_annotations.index]

# Plot the PCA again but now with the kmeans clusters, so we can identify the biology of each cluster.

# In[48]:
# In[122]:

# Fit a 10-component PCA on the rows of `data` selected by the
# Platform_VarFraction <= 0.2 mask and project the same matrix.
# The filtered, percentile-transformed matrix is built once so that the fit
# and the projection are guaranteed to use identical input (previously the
# whole expression was written out, and computed, twice).
pca = sklearn.decomposition.PCA(n_components=10, svd_solver='full')

# Boolean mask applied positionally to the rows of `data`;
# assumes `genes` and `data` share the same row order -- TODO confirm.
low_platform_genes = genes.Platform_VarFraction.values <= 0.2

# Transposed so that the original columns of `data` become the rows that
# PCA treats as observations.
percentile_matrix = functions.transform_to_percentile(
    data.loc[low_platform_genes]).transpose()

pca.fit(percentile_matrix)
pca_coords = pca.transform(percentile_matrix)

# In[123]:

# Label by 'cell_type' and 'Dataset', followed by every key of
# `nadias_annotations`.
label_columns = ['cell_type', 'Dataset']
label_columns.extend(nadias_annotations.keys())

functions.plot_pca(
    pca_coords,
    annotations,
    pca,
    labels=label_columns,
    colour_dict=blood_atlas_colours,
)

# This section is showing microglia only

# In[124]:

# For every sample that is not microglia, overwrite annotation columns at
# positions 31..37 with 'Unannotated'.
# NOTE(review): the column range is positional and hard-coded; confirm it
# still covers the intended columns if `annotations` gains or loses columns.
not_microglia = annotations.cell_type != 'microglia'
for column_name in annotations.columns[31:38]:
    annotations.loc[not_microglia, column_name] = 'Unannotated'

# In[125]:

functions.plot_pca(pca_coords,
                   annotations,