def ds_ds2():
    '''Build two example datasets whose sample sheets do not overlap.

    The example sample sheet and counts table are loaded once, the
    resulting dataset is copied, and then each of the two objects gets a
    different positional slice of the sample sheet. Only the
    ``samplesheet`` attribute is reassigned here.

    Returns:
        tuple: ``(ds, ds2)`` where ``ds`` keeps the first two rows of the
        sample sheet and ``ds2`` keeps the rows from position 2 onwards.
    '''
    # Kept as a function-local import, as in the original
    from singlet.dataset import Dataset

    head_ds = Dataset(
        samplesheet='example_sheet_tsv',
        counts_table='example_table_tsv',
    )
    # Copy before slicing so the second dataset starts from the full sheet
    tail_ds = head_ds.copy()

    head_ds.samplesheet = head_ds.samplesheet.iloc[:2]
    tail_ds.samplesheet = tail_ds.samplesheet.iloc[2:]
    return (head_ds, tail_ds)
content: Test Dataset class. ''' # Script if __name__ == '__main__': # NOTE: an env variable for the config file needs to be set when # calling this script print('Instantiating Dataset') from singlet.dataset import Dataset ds = Dataset(samplesheet='example_sheet_tsv', counts_table='example_table_tsv') print('Done!') print('Testing Dataset.__str__') assert (str(ds) == 'Dataset with 4 samples and 60721 features') print('Done!') print('Testing Dataset.__repr__') assert (ds.__repr__() == '<Dataset: 4 samples, 60721 features>') print('Done!') print('Testing Dataset.copy') assert (ds.copy() == ds) print('Done!') print('Testing Dataset.copy with modifications') dsp = ds.copy() dsp._counts.iloc[0, 0] = -5 assert (dsp != ds) print('Done!')
'''
author: Fabio Zanini
date: 07/08/17
content: Test Dataset class.
'''
import numpy as np


# Script
if __name__ == '__main__':

    # NOTE: an env variable for the config file needs to be set when
    # calling this script
    from singlet.dataset import Dataset
    example_sources = {
        'samplesheet': 'example_sheet_tsv',
        'counts_table': 'example_table_tsv',
    }
    ds = Dataset(**example_sources)

    # Two datasets with complementary sample sheets: ds keeps the first
    # two rows, ds2 keeps the rows from position 2 onwards
    ds2 = ds.copy()
    ds.samplesheet = ds.samplesheet.iloc[:2]
    ds2.samplesheet = ds2.samplesheet.iloc[2:]

    # Bootstrapped sample names are expected to carry a sampling marker
    print('Bootstrap')
    dsboot = ds.bootstrap()
    first_boot_name = dsboot.samplenames[0]
    assert '--sampling_' in first_boot_name
    print('Done!')

    # Compare the two datasets and check the smallest value returned
    print('Test feature comparison (Mann-Whitney U)')
    pvals = ds.compare(ds2, method='mann-whitney')
    smallest_pval = pvals.values.min()
    assert np.isclose(smallest_pval, 0.193931)
    print('Done!')

    print('Test feature comparison (Kolmogorov-Smirnov)')
    pvals = ds.compare(ds2, method='kolmogorov-smirnov')
'TNFRSF10B', 'EIF2A', 'DDIT3', 'PPP1R15A', 'ATF4', 'EDEM1', 'XBP1', 'ATF6', 'ATF3', 'ATG5', 'CASP3', 'CASP9', 'CASP4', 'CASP6', ] dsd = ds.copy() gids = [ ds.featuresheet.index[ds.featuresheet['GeneName'] == gname][0] for gname in gnames ] dsd.counts = dsd.counts.loc[gids] dsh = dsd.copy() if virus == 'dengue': dsh.query_samples_by_metadata('virus_reads_per_million > 1e3', inplace=True) dsh.query_samples_by_metadata('MOI != "0"', inplace=True) else: dsh.query_samples_by_metadata('virus_reads_per_million > 1e1', inplace=True) dsh.query_samples_by_metadata('MOI == "1"', inplace=True)
n = ds.samplesheet['numberDengueReads'].astype(int) ds.samplesheet['virus_reads_per_million'] = 1e6 * n / (cov + n) ds.samplesheet['log_virus_reads_per_million'] = np.log10( 0.1 + ds.samplesheet['virus_reads_per_million']) #print('Log counts') #ds.counts.log(inplace=True) print('Get cell cycle genes (Core 67)') cc = load_cell_cycle_table()[[ 'GeneName', 'Periodic Rank', 'Phase', 'Core' ]] cc.query('Core != "No"', inplace=True) print('Hierarchical clustering of cells based on cell cycle and virus') dsn = ds.copy() dsn.counts = dsn.counts.loc[cc.index] for col in ['Periodic Rank', 'Phase']: dsn.featuresheet.loc[:, col] = cc.loc[dsn.featuresheet.index, col] dsn.rename(axis='features', column='GeneName', inplace=True) print('Log counts') dsn.counts.log(inplace=True) dsn.featuresheet.loc[:, 'Mean'] = dsn.counts.mean(axis=1) # Only keep decently expressed genes dsn.query_features_by_metadata('Mean > 1', inplace=True) hier = dsn.cluster.hierarchical( axis='samples', #phenotypes=['virus_reads_per_million'],
'tsne2_MOI1_10' ]: ds.samplesheet[col] = metadata_felix[col] ds.samplesheet['log_Dn'] = np.log10(1e-6 + ds.samplesheet['Dn']) ds.samplesheet['log_Ds'] = np.log10(1e-6 + ds.samplesheet['Ds']) cov = ds.samplesheet['coverage'] n = ds.samplesheet['numberDengueReads'].astype(int) ds.samplesheet['virus_reads_per_million'] = 1e6 * n / (cov + n) ds.samplesheet['log_virus_reads_per_million'] = np.log10(0.1 + 1e6 * n / (cov + n)) vs = ds.samplesheet[['tsne1_MOI1_10', 'tsne2_MOI1_10']] # Restrict to high variance SNVs dsv = ds.copy() ind = ds.counts.values.var(axis=1).argsort()[-200:] dsv.counts = dsv.counts.iloc[ind] vsg = dsv.dimensionality.tsne(perplexity=30) # Cluster # NOTE: the number is manually chosen but does not matter much atm dsv.samplesheet['clusterN'] = ds.cluster.kmeans(axis='samples', n_clusters=6) dss = dsv.split('clusterN') # Plot gene expression tSNE overlayed with viral genomics fig, axs = plt.subplots(3, 3, figsize=(7, 6), sharex=True, sharey=True) axs = axs.ravel() plots = [ 'log_virus_reads_per_million', 'coverage', 'depth', 'numSNV', 'log_Dn',
# Script
if __name__ == '__main__':

    # NOTE: an env variable for the config file needs to be set when
    # calling this script
    from singlet.dataset import Dataset
    example_sources = {
        'samplesheet': 'example_sheet_tsv',
        'counts_table': 'example_table_tsv',
    }
    ds = Dataset(**example_sources)

    # Thresholds shared by both expression-based selection checks
    expression_filter = {'n_samples': 1, 'exp_min': 1}

    # Non-inplace call: the selection is read from the return value
    print('Test feature selection by expression')
    res = ds.feature_selection.expressed(**expression_filter)
    first_selected = res[0]
    assert first_selected == 'TSPAN6'
    print('Done!')

    # Inplace call on a copy: the selection is read from the dataset itself
    print('Test feature selection by expression, in place')
    dsp = ds.copy()
    dsp.feature_selection.expressed(**expression_filter, inplace=True)
    first_kept = dsp.featurenames[0]
    assert first_kept == 'TSPAN6'
    print('Done!')

    print('Test feature selection by overdispersed strata')
    res = ds.feature_selection.overdispersed_strata()
    last_selected = res[-1]
    assert last_selected == 'GLIPR2'
    print('Done!')

    print('Test feature selection by overdispersed strata, in place')
    dsp = ds.copy()
    dsp.feature_selection.overdispersed_strata(inplace=True)
    last_kept = dsp.featurenames[-1]
    assert last_kept == 'GLIPR2'
    print('Done!')
featuresheet='humanGC38', ) ds.query_samples_by_counts('total >= 50000', inplace=True) ds.samplesheet.rename(columns={'time [h]': 'time'}, inplace=True) cov = ds.samplesheet['coverage'] = ds.counts.sum(axis=0) ds.counts.normalize('counts_per_million', inplace=True) ds.samplesheet['virus_reads_per_million'] = 0 for virus in ('dengue', 'zika'): ind = ds.samplesheet['virus'] == virus n = ds.samplesheet.loc[ind, 'number'+virus.capitalize()+'Reads'].astype(int) ds.samplesheet.loc[ind, 'virus_reads_per_million'] = 1e6 * n / (cov.loc[ind] + n) ds.counts.log(inplace=True) # Select only some cells for comparison dsc = ds.copy() dsc.samplesheet = dsc.samplesheet.query('500 < virus_reads_per_million') print('Get correlations') dsv = dsc.split(phenotypes='virus') vs = [] cos = [] for virus, dsvi in dsv.items(): co = dsvi.correlation.correlate_features_phenotypes( phenotypes='virus_reads_per_million', fillna=0).fillna(0) cos.append(co) vs.append(virus) cos = pd.concat(cos, axis=1) cos.columns = pd.Index(vs, name='virus')
from singlet.dataset import Dataset ds = Dataset(counts_table='example_PBMC') # Normalize ds.counts.normalize(method='counts_per_million', inplace=True) ds.counts.log(inplace=True) # Select features ds.feature_selection.expressed(n_samples=3, exp_min=1, inplace=True) ds.feature_selection.overdispersed_strata(n_features_per_stratum=20, inplace=True) # Reduce dimensionality vs = ds.dimensionality.tsne(n_dims=2, theta=0.5, perplexity=0.8) dsr = ds.copy() dsr.counts = vs.T # Cluster dsr.samplesheet['dbscan'] = dsr.cluster.dbscan(eps=5, axis='samples') dsr.samplesheet['kmeans'] = dsr.cluster.kmeans(n_clusters=7, axis='samples') # Plot t-SNE fig, axs = plt.subplots(nrows=1, ncols=2, sharex=True, sharey=True, figsize=(8, 4)) dsr.plot.scatter_reduced_samples(vs, color_by='dbscan',