# Smoke test for hierarchical clustering on a singlet Dataset.
#
# NOTE: an environment variable for the config file needs to be set when
# calling this script.
from singlet.dataset import Dataset

ds = Dataset(
    samplesheet='example_sheet_tsv',
    counts_table='example_table_tsv',
)

# 1. Cluster along the sample axis and check the complete leaf order.
print('Hierarchical clustering of samples')
sample_tree = ds.cluster.hierarchical('samples', optimal_ordering=True)
expected_sample_leaves = (
    'second_sample',
    'test_pipeline',
    'first_sample',
    'third_sample',
)
assert tuple(sample_tree['leaves']) == expected_sample_leaves
print('Done!')

# 2. Cluster along the feature axis, restricted to the first 200 rows of
#    the counts table, and check the first three leaves.
print('Hierarchical clustering of features')
ds.counts = ds.counts.iloc[:200]
feature_tree = ds.cluster.hierarchical('features', optimal_ordering=True)
expected_first_features = ('PNPLA4', 'ITGAL', 'HOXA11')
assert tuple(feature_tree['leaves'])[:3] == expected_first_features
print('Done!')

# 3. Cluster features together with one quantitative phenotype and check
#    the position at which that phenotype ends up among the leaves.
print('Hierarchical clustering of features and phenotypes')
ds.counts = ds.counts.iloc[:200]
phenotype_name = 'quantitative_phenotype_1_[A.U.]'
mixed_tree = ds.cluster.hierarchical(
    axis='features',
    phenotypes=(phenotype_name,),
    optimal_ordering=True,
)
assert mixed_tree['leaves'][23] == phenotype_name
print('Done!')
# Derive virus abundance per sample, both linear and log-scaled. The 0.1
# offset keeps log10 finite when a sample has zero virus reads.
print('Add normalized virus counts')
ds.samplesheet['virus_reads_per_million'] = (
    1e6 * ds.samplesheet['n_reads_virus'] / ds.samplesheet['n_reads']
)
ds.samplesheet['log_virus_reads_per_million'] = np.log10(
    0.1 + ds.samplesheet['virus_reads_per_million']
)

# Drop samples below the read-count threshold given on the command line.
# The query string refers to the local variable through '@n_reads_min',
# so that name must stay as it is.
print('Filter low-quality cells')
n_reads_min = args.n_reads_min
ds.query_samples_by_metadata(
    'n_reads > @n_reads_min',
    local_dict=locals(),
    inplace=True,
)

# Keep a feature only if its count exceeds args.n_cpm_min_genes[0] in at
# least args.n_cpm_min_genes[1] columns of the counts table.
print('Limit to decently expressed genes')
n_columns_above_threshold = (ds.counts > args.n_cpm_min_genes[0]).sum(axis=1)
ind = n_columns_above_threshold >= args.n_cpm_min_genes[1]
ds.counts = ds.counts.loc[ind]

# Gene names that occur more than once in the featuresheet cannot serve
# as unique feature labels, so those rows are removed before renaming.
print('Ignore genes with multiple IDs')
from collections import Counter
genec = Counter(ds.featuresheet['GeneName'].values)
genes_multiple = [name for name, n_ids in genec.items() if n_ids > 1]
has_multiple_ids = ds.featuresheet['GeneName'].isin(genes_multiple)
ds.featuresheet = ds.featuresheet.loc[~has_multiple_ids]

print('Translate to gene names')
ds.rename(axis='features', column='GeneName', inplace=True)

# Build a second dataset with only the mCMV features, then narrow it to
# the samples whose 'moi' is 'low' or 'high'.
print('Restrict to virus genes')
dsv = ds.query_features_by_metadata('Organism == "mCMV"')
dsv.query_samples_by_metadata("moi in ('low', 'high')", inplace=True)
import pandas as pd
import xarray as xr
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Point singlet at its config file and make the local checkout importable
# before the package itself is imported.
os.environ['SINGLET_CONFIG_FILENAME'] = 'singlet.yml'
sys.path.append('/home/fabio/university/postdoc/singlet')
from singlet.dataset import Dataset, CountsTable


# Script
if __name__ == '__main__':

    # Start from the 'dengue' samplesheet and use the 'aaf' variable of the
    # netCDF file as the counts table, with missing values replaced by 0.
    ds = Dataset(samplesheet='dengue')
    data = xr.open_dataset('../bigdata/allele_frequencies.nc')
    ds.counts = CountsTable(data['aaf'].to_pandas().fillna(0))

    # Sync with Felix metadata
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use this with a trusted metadata pickle.
    with open('../data/metadataD_SNV_with_tsne.pkl', 'rb') as handle:
        metadata_felix = pickle.load(handle)

    # Keep only the samples for which both 'Dn' and 'Ds' are not NaN, and
    # restrict the samplesheet and the metadata table to those samples.
    has_both_distances = (~np.isnan(metadata_felix[['Dn', 'Ds']])).all(axis=1)
    samples = metadata_felix.index[has_both_distances]
    ds.samplesheet = ds.samplesheet.loc[samples]
    metadata_felix = metadata_felix.loc[samples]

    # Copy the selected metadata columns onto the samplesheet.
    columns_to_copy = [
        'coverage',
        'Ds',
        'Dn',
        'depth',
        'numSNV',
        'Dn_s',
        'tsne1_MOI1_10',
        'tsne2_MOI1_10',
    ]
    for col in columns_to_copy:
        ds.samplesheet[col] = metadata_felix[col]

    # Log-scaled versions; the 1e-6 offset keeps log10 finite at zero.
    ds.samplesheet['log_Dn'] = np.log10(1e-6 + ds.samplesheet['Dn'])
    ds.samplesheet['log_Ds'] = np.log10(1e-6 + ds.samplesheet['Ds'])