from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *

# Load the inputs. `data` holds the gene expression file, `annotations` holds
# the sample metadata and `genes` holds the gene metadata. The information is
# spread over several files, so each table is read and merged separately.

data = pd.read_csv(
    '/Users/pwangel/Downloads/myeloid_atlas_expression_v7.1.tsv',
    index_col=0,
    sep='\t',
)
annotations = pd.read_csv(
    '/Users/pwangel/Downloads/myeloid_atlas_samples_v7.1.tsv',
    index_col=0,
    sep='\t',
)
# The platform has to be taken from this separate annotation file.
annotations_platform = pd.read_csv(
    '/Users/pwangel/PlotlyWorkspace/combine_data/blood/outputs_for_front_end/iMac_annotations.tsv',
    index_col=0,
    sep='\t',
)
annotations = annotations.merge(
    annotations_platform['Platform_Category'],
    left_index=True,
    right_index=True,
    how='inner',
)

genes_s4m = pd.read_csv(
    '/Users/pwangel/Downloads/myeloid_atlas_genes_v7.1.tsv',
    index_col=0,
    sep='\t',
)
genes_varPart = pd.read_csv(
    '/Users/pwangel/Downloads/myeloid_atlas_genes.tsv',
    index_col=0,
    sep='\t',
)
# `genes` should carry the platform variance fraction in addition to the
# ensembl-gene symbol conversion.
genes = genes_s4m.merge(
    genes_varPart['Platform_VarFraction'],
    left_index=True,
    right_index=True,
    how='left',
)

# Percentile-transform the rows selected by genes.inclusion, and separately
# all rows, restricted to the annotated samples.
cut_data = functions.transform_to_percentile(
    data.loc[
        genes.loc[genes.inclusion.values].index.values.astype(str),
        annotations.index.values,
    ]
)
all_ranked_data = functions.transform_to_percentile(
    data.loc[:, annotations.index.values]
)
# Dummy values for the categories of samples we will compare.
annotations['Cluster'] = 1

# Generate the atlas PCA and use it to remove samples that are not part of the
# DE: only samples whose first PCA coordinate is negative are kept.
pca = sklearn.decomposition.PCA(svd_solver='full', n_components=3)
pca_coords = pca.fit_transform(cut_data.T)
annotations = annotations.loc[pca_coords[:, 0] < 0]
all_ranked_data = all_ranked_data[annotations.index]
cut_data = cut_data[annotations.index]

# Folder to output p values and graphs into.
fname = 'macrophage_tissues'
folder = 'macrophage_tissues'

cluster_names = []
# ---- Example no. 2 ----
# 0
#data = data.merge(ext_data, how='inner', left_index=True, right_index=True)
#annotations = pd.concat([annotations, ext_annotations])

# In[3]:

# Give the in vivo 'Illumina V4' samples their own platform category so that
# they are treated separately from the remaining 'Illumina V4' samples.
weird_index = annotations.loc[
    (annotations['Platform Category'] == 'Illumina V4')
    & (annotations['Sample Source'] == 'in vivo')].index
annotations.loc[weird_index, 'Platform Category'] = 'Illumina V4 2'
# Drop the skin and spleen samples. np.isin is used instead of np.in1d, which
# is deprecated as of NumPy 2.0; for 1-D input the two return the same mask.
annotations = annotations.loc[~np.isin(annotations['Tissue Type'].values,
                                       ['skin', 'spleen'])]

# In[4]:

# Keep only the columns of the remaining samples and convert to percentiles.
data = functions.transform_to_percentile(data[annotations.index])

# Only need to compute the gene variance fraction if it has not been done
# already. Here it is recomputed from the transformed data.
# NOTE(review): the original comment says a previously calculated version was
# already read into the gene dataframe above - confirm that recomputing it
# here is intended.

# In[5]:

# The column is renamed before the call below; presumably
# calculate_platform_dependence expects 'Platform_Category' - TODO confirm.
annotations.rename(columns={'Platform Category': 'Platform_Category'},
                   inplace=True)
genes = functions.calculate_platform_dependence(data, annotations)

# In[6]:

# Fit a 10-component PCA on the genes whose platform variance fraction is at
# most 1.0, re-applying the percentile transform to the selected rows.
pca = sklearn.decomposition.PCA(n_components=10, svd_solver='full')
pca.fit(
    functions.transform_to_percentile(
        data.loc[genes.Platform_VarFraction.values <= 1.0]).transpose())
# Pluripotent RNASeq section: load and merge the sample annotations, restrict
# the data to RNASeq samples, run a PCA and plot it.
# NOTE(review): `data` is not loaded in this section (the read below is
# commented out), so it must already be defined by earlier code.
#data        = pd.read_csv('/Users/pwangel/Downloads/pluripotent_atlas_data.tsv', sep='\t', index_col=0)
annotations = pd.read_csv('/Users/pwangel/Downloads/pluripotent_RNASeq_annotations.tsv', sep='\t', index_col=0)
lizzi_anno  = pd.read_csv('/Users/pwangel/PlotlyWorkspace/combine_data/naive_stemcells/stemcell_annotations.tsv', sep='\t', index_col=0)
# Left-join the LM_Group_COLOR column onto the annotations by index.
annotations = annotations.merge(lizzi_anno['LM_Group_COLOR'], how='left', left_index=True, right_index=True)
experiment_anno = pd.read_csv('/Users/pwangel/Downloads/RNASeq_only_pluripotent_annotations.tsv', sep='\t', index_col=0)
# Re-key experiment_anno as '<chip_id>;<Dataset>' before joining on the index.
# Presumably the annotations index uses the same format (chip_id is recovered
# from it by splitting on ';' below) - TODO confirm.
experiment_anno.index = [i+';'+j for i,j in zip(experiment_anno.chip_id.values.astype(str), experiment_anno.Dataset.values.astype(int).astype(str))]
annotations = annotations.merge(experiment_anno[['Experiment', 'Time', 'Initial Condition']], how='left', left_index=True, right_index=True)
#annotations.Dataset = annotations.Dataset.astype(float).astype(int).astype(str)

# Gene table read with a single named column, 'symbol', indexed by the first
# column of the file.
genes       = pd.read_csv('/Users/pwangel/Data/ensembl_hg38.91/gene_to_symbol_ensembl91_human.tsv', sep='\t', index_col=0, names=['symbol'])
# Symbols of the genes present in both `data` and `genes`, intersected with the
# index of `genes_df`.
# NOTE(review): `genes_df` is not defined in this section - verify it exists.
gene_list   = np.intersect1d(genes.loc[np.intersect1d(data.index, genes.index)].symbol.values, genes_df.index.values)
# chip_id is the part of the annotations index before the first ';'.
annotations['chip_id'] = [i.split(';')[0] for i in annotations.index.values.astype(str)]
# Keep RNASeq samples only and exclude dataset 7275.
# NOTE(review): Dataset is compared with the string '7275.0' - confirm the
# column really holds strings in that format, otherwise nothing is excluded.
annotations = annotations.loc[(annotations.Platform_Category=='RNASeq') & (annotations.Dataset!='7275.0')] # This dataset is included by mistake; it is annotated as endoderm
# Select the data columns by chip_id, in the order of the annotation rows.
data        = data[annotations.chip_id]

data = functions.transform_to_percentile(data)

# Run the PCA
# NOTE(review): the PCA is fitted on transform_to_percentile(data.transpose())
# but the coordinates are computed from data.transpose(), i.e. a different
# matrix from the one used for fitting. Other sections apply the percentile
# transform before transposing - confirm which form is intended.

pca        = sklearn.decomposition.PCA(n_components=10, svd_solver='full')
pca.fit(functions.transform_to_percentile(data.transpose()))
pca_coords = pca.transform(data.transpose())

# Write the PCA plot, labelled by sample type, platform and dataset, to an
# HTML file.
functions.plot_pca(pca_coords, annotations,pca, labels=['generic_sample_type', 'Platform_Category', 'Dataset'], colour_dict={}, \
                   pcs=[1,2,3], out_file='/Users/pwangel/PlotlyWorkspace/combine_data/naive_stemcells/RNASeq_only_pluripotent.html')

#### Apply k-means clustering to divide genes into an on/off state

# Two clusters: one for each of the two states.
kmeans = sklearn.cluster.KMeans(n_clusters=2)
#data_output = pd.DataFrame(index=gene_list, columns=['Bimodal val', 'Low_Expr', 'High_Expr', 'Low_Std', 'High_Std'])
# Load the expression matrix, the sample annotations and the gene table.
data = pd.read_csv('/Users/pwangel/Downloads/myeloid_atlas_expression_v7.1.tsv',
                   index_col=0, sep='\t')
annotations = pd.read_csv(
    '/Users/pwangel/PlotlyWorkspace/combine_data/blood/outputs_for_front_end/iMac_annotations.tsv',
    index_col=0, sep='\t')
genes = pd.read_csv('/Users/pwangel/Downloads/myeloid_atlas_genes.tsv',
                    index_col=0, sep='\t')

# In[29]:

# Convert the expression values to percentiles.
data = functions.transform_to_percentile(data)

# The gene variance fraction only has to be computed if that has not been done
# already; a previously calculated version was read into `genes` above, so the
# computation below stays commented out.

# In[6]:

#genes = functions.calculate_platform_dependence(data, annotations)
#genes.to_csv('/Users/pwangel/Downloads/myeloid_atlas_genes.tsv', sep='\t')

# In[30]:

# Fit a 10-component PCA on the genes whose platform variance fraction is at
# most 0.2, re-applying the percentile transform to the selected rows.
pca = sklearn.decomposition.PCA(svd_solver='full', n_components=10)
pca.fit(functions.transform_to_percentile(
    data.loc[genes['Platform_VarFraction'].values <= 0.2]).T)
pca_coords = pca.transform(
# ---- Example no. 5 ----
# 0
#### This is an example script utilising the Mann-Whitney rank-sum test implemented in SciPy.
#### The groups being tested here are the in vitro vs in vivo DC1 cells.

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *

# Load the expression matrix, the sample annotations and the gene table.
data = pd.read_csv('/location/of/expression.tsv', index_col=0, sep='\t')
annotations = pd.read_csv('/location/of/annotations.tsv', index_col=0, sep='\t')
genes = pd.read_csv('/location/of/myeloid_atlas_genes_v7.1.tsv',
                    index_col=0, sep='\t')

# Percentile-transform only the rows selected by the genes.inclusion mask.
cut_data = functions.transform_to_percentile(
    data.loc[genes['inclusion'].values])

# For this example, keep only the samples whose tier1 annotation is DC1.

annotations = annotations.loc[annotations['tier1'] == 'DC1']
cut_data = cut_data[annotations.index.values]

# Empty arrays for the results.
pvals, delta_median = np.array([]), np.array([])

# Define dataframe to keep results in. Also keep the mean and std of each group for the hell of it

df_output = pd.DataFrame(index=gene_list,
                         columns=[
                             'P val', 'In vitro mean', 'In vivo mean',
# Read the single-cell data aggregated by cluster, plus its metadata.
sc_data = pd.read_csv(
    '/Users/pwangel/Data/Single_Cell/Han/aggregated_by_cluster_100_0pt0.tsv',
    index_col=0, sep='\t')
sc_annotations = pd.read_csv(
    '/Users/pwangel/Data/Single_Cell/Han/aggregated_by_cluster_metadata_100_0pt0.tsv',
    index_col=0, sep='\t')
# Copy the cell type into LM_Group_COLOR, the grouping column name used for
# the other annotations.
sc_annotations['LM_Group_COLOR'] = sc_annotations['celltype'].values

# Inner-join the single-cell columns onto the data by index, fill missing
# values with 0.0, and append the single-cell annotations.
data = data.merge(sc_data, left_index=True, right_index=True, how='inner')
data = data.fillna(0.0)
annotations = pd.concat([annotations, sc_annotations])

#data = np.log2(1.e6*data/data.sum()+1)
data = functions.transform_to_percentile(data)

# No gene filtering here: both names refer to the same transformed frame.
cut_data = all_ranked_data = data

#cut_data    = functions.transform_to_percentile(data.loc[genes.loc[genes.inclusion.values].index.values.astype(str), annotations.index.values])
#all_ranked_data = functions.transform_to_percentile(data.loc[:, annotations.index.values])

# Sample mask with every entry True, i.e. all samples are selected.
sel_samples = np.ones(annotations.shape[0], dtype=bool)

annotations = annotations.loc[sel_samples]
cut_data = cut_data.loc[:, sel_samples]
cut_unfiltered_data = all_ranked_data.loc[:, sel_samples]

# Two-component Gaussian mixture model.
gmm = sklearn.mixture.GaussianMixture(n_components=2)
# Print the number of distinct (Dataset, Platform_Category) pairs per platform
# category, with pandas display truncation switched off.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(annotations[[
        'Dataset', 'Platform_Category'
    ]].drop_duplicates().groupby('Platform_Category').size())

# In[67]:

# Print the number of annotation rows per platform category.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(annotations.groupby(['Platform_Category']).size())

# Now to actually make the atlas. The first step of the two-step atlas process is to transform the expression values to percentile values.

# In[34]:

data = functions.transform_to_percentile(data)

# Second step: model the influence of platform upon expression for each gene. As this can take a while, I often save the results and just read them in rather than recompute them. In this case the results are saved in 'pluripotent_atlas_genes_with_ext.tsv'.

# In[35]:

# NOTE(review): unlike the other gene-table reads in this file, the read below
# does not pass index_col=0, so the first column of the file becomes an
# ordinary column rather than the index - confirm this is intended.
#genes = functions.calculate_platform_dependence(data, annotations)
#genes.to_csv('../data/pluripotent_atlas_genes_with_ext.tsv', sep='\t')
#genes = pd.read_csv('../data/pluripotent_atlas_genes.tsv', sep='\t')
genes = pd.read_csv('../data/pluripotent_atlas_genes_with_ext.tsv', sep='\t')

# Run the PCA on the expression data of the filtered, transformed genes. The value of the gene filter threshold is 0.25. I have not looked closely at this value. Perhaps a higher value would allow more components into the PCA.

# In[36]:

# PCA restricted to the first 10 components.
pca = sklearn.decomposition.PCA(n_components=10, svd_solver='full')
# ---- Example no. 8 ----
# 0
    names=['symbol'])
# Restrict the symbol conversion table to main_ensembl_ids and left-join it
# onto the gene table by index.
genes_conversion = genes_conversion.loc[main_ensembl_ids]
genes = genes_s4m.merge(genes_conversion,
                        how='left',
                        left_index=True,
                        right_index=True)

# Keep only the listed datasets, then only the naive and primed samples.
# np.isin is used instead of np.in1d, which is deprecated as of NumPy 2.0; for
# 1-D input the two return the same boolean mask.
annotations = annotations.loc[np.isin(
    annotations.Dataset.values.astype(int),
    [7124, 7135, 7240, 7253])]  #, 6884, 7253])]
annotations = annotations.loc[np.isin(annotations.LM_Group_COLOR,
                                      ['naive', 'primed'])]
# Selecting by annotations.chip_id returns the columns in the order of the
# annotation rows. The columns stay labelled by chip_id, so later positional
# masks rely on this ordering.
# NOTE(review): the original author was not sure the samples end up in the
# right order - verify against the annotation rows.
data = data[
    annotations.chip_id]

data = functions.transform_to_percentile(data)

#Loop through the sample types and genes to find differentially expressed genes
for i_gene in gene_list:

    ensembl_id = genes_conversion.index.values[genes_conversion.symbol.values
                                               == i_gene]

    fig = Figure()
    for i_dataset in annotations.Dataset.unique():
        for i_type, i_colour in zip(['naive', 'primed'], ['red', 'blue']):

            sel = (annotations.LM_Group_COLOR == i_type) & (annotations.Dataset
                                                            == i_dataset)
            fig.add_trace(
                Histogram(x=data.loc[ensembl_id, sel.values].values[0],
# Inner-join the external data onto the atlas data by index and append the
# external sample annotations.
data = data.merge(ext_data, left_index=True, right_index=True, how='inner')
annotations = pd.concat([annotations, ext_annotations])


# In[49]:


# Show the shapes of the combined annotations and data, one per line.
print(annotations.shape, data.shape, sep='\n')


# In[50]:


# Convert the expression values to percentiles.
data = functions.transform_to_percentile(data)


# The gene variance fraction only has to be computed if that has not been done
# already; here a previously calculated version is read from file instead.

# In[51]:


#genes = functions.calculate_platform_dependence(data, annotations)
#genes.to_csv('/Users/pwangel/Downloads/temp_ext_blood_atlas_genes.tsv', sep='\t')
genes = pd.read_csv(
    '/Users/pwangel/Downloads/temp_ext_blood_atlas_genes.tsv',
    index_col=0,
    sep='\t',
)


# In[52]: