Example #1
def ds_ds2():
    from singlet.dataset import Dataset
    ds = Dataset(samplesheet='example_sheet_tsv',
                 counts_table='example_table_tsv')
    ds2 = ds.copy()
    ds.samplesheet = ds.samplesheet.iloc[:2]
    ds2.samplesheet = ds2.samplesheet.iloc[2:]
    return (ds, ds2)
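
Several of these snippets (this one and Examples 2, 15 and 19) read like pytest fixtures whose decorator was stripped during extraction. A minimal sketch of how this one would be registered and consumed, assuming pytest as the test runner (fixture and test names are illustrative):

import pytest


@pytest.fixture
def ds_pair():
    # NOTE: the singlet config env variable must be set before this import
    from singlet.dataset import Dataset
    ds = Dataset(samplesheet='example_sheet_tsv',
                 counts_table='example_table_tsv')
    ds2 = ds.copy()
    ds.samplesheet = ds.samplesheet.iloc[:2]
    ds2.samplesheet = ds2.samplesheet.iloc[2:]
    return (ds, ds2)


def test_split_is_disjoint(ds_pair):
    ds, ds2 = ds_pair
    # The two halves should share no sample names
    assert not set(ds.samplenames) & set(ds2.samplenames)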
Example #2
def ds():
    from singlet.dataset import Dataset
    dset = Dataset(
            samplesheet='example_sheet_tsv',
            counts_table='example_table_tsv')

    dset.counts.exclude_features(spikeins=True, other=True, inplace=True)
    return dset
Example #3
def get_dataset(tissue, membrane_only=True):
    cell_types, plates = parse_annotations(tissue)  # as in Example 6
    counts = parse_counts(tissue)
    if membrane_only:
        # The GO table is gene-indexed (cf. Example 6), so filter on
        # its index
        go = parse_go_plasma_membrane().index
        genes_membrane = go[go.isin(counts.index)]
        counts = counts.loc[genes_membrane]

    ds = Dataset(
        samplesheet=SampleSheet(cell_types),
        counts_table=CountsTable(counts),
    )
    return ds
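
parse_counts, parse_annotations and parse_go_plasma_membrane are not shown in these excerpts; Example 6 implies the GO parser returns a gene-indexed DataFrame with a 'GONames' column. A hypothetical stand-in under that assumption (gene symbols and term names are illustrative only):

import pandas as pd


def parse_go_plasma_membrane_stub():
    # Hypothetical stand-in: gene symbols as the index, GO term names in a
    # column, mirroring how Example 6 indexes into the real table
    return pd.DataFrame(
        {'GONames': ['plasma membrane', 'integral component of membrane']},
        index=pd.Index(['ITGAL', 'TSPAN6'], name='GeneName'),
    )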
Example #4
import os
import sys
import pickle
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

os.environ['SINGLET_CONFIG_FILENAME'] = 'singlet.yml'
sys.path.append('/home/fabio/university/postdoc/singlet')
from singlet.dataset import Dataset, CountsTable

# Script
if __name__ == '__main__':

    ds = Dataset(
        samplesheet='dengue',
        counts_table='dengue',
        featuresheet='humanGC38',
    )
    data_snv = xr.open_dataset('../bigdata/allele_frequencies.nc')
    #ds.counts = CountsTable(data_snv['aaf'].to_pandas().fillna(0))

    # Sync with Felix metadata
    with open('../data/metadataD_SNV_with_tsne_and_tsneSNV.pkl', 'rb') as ff:
        metadata_felix = pickle.load(ff)
    samples = metadata_felix.index[(
        ~np.isnan(metadata_felix[['Dn', 'Ds']])).all(axis=1)]
    ds.samplesheet = ds.samplesheet.loc[samples]
    metadata_felix = metadata_felix.loc[samples]
    for col in [
            'coverage', 'Ds', 'Dn', 'depth', 'numSNV', 'Dn_s', 'tsne1_MOI1_10',
            'tsne2_MOI1_10', 'tsne1_SNV', 'tsne2_SNV', 'clusterN_SNV',
    ]:
        ds.samplesheet[col] = metadata_felix[col]
Example #5
#!/usr/bin/env python
# vim: fdm=indent
'''
author:     Fabio Zanini
date:       07/08/17
content:    Test Dataset class.
'''
import numpy as np

# Script
if __name__ == '__main__':

    # NOTE: an env variable for the config file needs to be set when
    # calling this script
    from singlet.dataset import Dataset
    ds = Dataset(samplesheet='example_sheet_tsv',
                 counts_table='example_table_tsv')
    ds2 = ds.copy()
    ds.samplesheet = ds.samplesheet.iloc[:2]
    ds2.samplesheet = ds2.samplesheet.iloc[2:]

    print('Bootstrap')
    dsboot = ds.bootstrap()
    assert ('--sampling_' in dsboot.samplenames[0])
    print('Done!')

    print('Test feature comparison (Mann-Whitney U)')
    pvals = ds.compare(ds2, method='mann-whitney')
    assert (np.isclose(pvals.values.min(), 0.193931))
    print('Done!')

    print('Test feature comparison (Kolmogorov-Smirnov)')
Example #6
def get_dataset(tissue,
                membrane_only=True,
                regenerate=False,
                go_contains=None,
                go_exclude=None):

    # Some tissues like brain were split for sorting, we merge them here
    dss = []
    for tissue_facs in tissues_prediction[tissue]:
        cell_types, plates = parse_annotations(tissue_facs)
        counts = parse_counts(tissue_facs, regenerate=regenerate)
        if membrane_only:
            go = parse_go_plasma_membrane().index
            genes_membrane = go[go.isin(counts.index)]
            counts = counts.loc[genes_membrane]

        if (go_contains is not None) and (go_exclude is not None):
            raise ValueError('Use either go_contains or go_exclude')
        if go_contains is not None:
            go = parse_go_plasma_membrane()
            genes = go.index[go['GONames'].str.contains(go_contains)]
            genes = np.intersect1d(genes, counts.index)
            counts = counts.loc[genes]
        elif go_exclude is not None:
            go = parse_go_plasma_membrane()
            genes = go.index[~go['GONames'].str.contains(go_exclude)]
            genes = np.intersect1d(genes, counts.index)
            counts = counts.loc[genes]

        dss.append({'samplesheet': cell_types, 'counts': counts})

    if len(dss) == 1:
        # Single subtissue: build the Dataset directly from the sole entry
        ds = Dataset(
            samplesheet=SampleSheet(dss[0]['samplesheet']),
            counts_table=dss[0]['counts'],
        )
        return ds
    else:
        # Merging is kind of messy because some genes are absent from
        # either subtissue; missing genes get the log-space zero (-1.0)
        # for now, Michelle is working on a better solution (we have
        # those numbers somewhere)
        genes = set()
        for ds in dss:
            genes |= set(ds['counts'].index.values)
        genes = pd.Index(sorted(genes), name=ds['counts'].index.name)
        for ds in dss:
            genes_missing = genes[~genes.isin(ds['counts'].index)]
            for gene in genes_missing:
                # Counts are normalized, pseudocounted, and logged, so a
                # zero count corresponds to -1.0 (log10 of the pseudocount)
                ds['counts'].loc[gene] = -1.0
            ds['counts'] = ds['counts'].loc[genes]
        ngenes = len(genes)
        ncells = sum(ds['samplesheet'].shape[0] for ds in dss)
        samplesheet_all = pd.concat([ds['samplesheet'] for ds in dss], axis=0)
        counts_all = pd.DataFrame(np.zeros((ngenes, ncells), float),
                                  index=genes,
                                  columns=samplesheet_all.index)
        for ds in dss:
            counts_all.loc[:,
                           ds['counts'].columns.values] = ds['counts'].values
        counts_all = CountsTable(counts_all)
        if ds['counts']._normalized:
            counts_all._normalized = ds['counts']._normalized

        ds = Dataset(
            samplesheet=SampleSheet(samplesheet_all),
            counts_table=counts_all,
        )
        return ds
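
The gene-union merge above fills missing genes one loc-assignment at a time; a sketch of the same operation with pandas reindex, which inserts the log-space zero (-1.0) for absent genes in one call (same assumptions as the function above):

import pandas as pd


def merge_counts(counts_list, fill_value=-1.0):
    # Union of gene indices across all subtissues
    genes = counts_list[0].index
    for counts in counts_list[1:]:
        genes = genes.union(counts.index)
    # reindex fills genes absent from a subtissue, then the tables are
    # concatenated along the cell axis
    return pd.concat(
        [counts.reindex(genes, fill_value=fill_value)
         for counts in counts_list],
        axis=1)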
Example #7
import os
import sys
import pickle
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

os.environ['SINGLET_CONFIG_FILENAME'] = 'singlet.yml'
sys.path.append('/home/fabio/university/postdoc/singlet')
from singlet.dataset import Dataset


# Script
if __name__ == '__main__':

    ds = Dataset(
            samplesheet='dengue',
            counts_table='dengue',
            featuresheet='humanGC38',
            )

    # Sync with Felix metadata
    with open('../data/metadataD_SNV_with_tsne.pkl', 'rb') as ff:
        metadata_felix = pickle.load(ff)
    samples = metadata_felix.index[
        (~np.isnan(metadata_felix[['Dn', 'Ds']])).all(axis=1)]
    ds.samplesheet = ds.samplesheet.loc[samples]
    metadata_felix = metadata_felix.loc[samples]
    for col in ['coverage', 'Ds', 'Dn', 'depth', 'numSNV', 'Dn_s',
                'tsne1_MOI1_10', 'tsne2_MOI1_10']:
        ds.samplesheet[col] = metadata_felix[col]
    ds.samplesheet['log_Dn'] = np.log10(1e-6 + ds.samplesheet['Dn'])
    ds.samplesheet['log_Ds'] = np.log10(1e-6 + ds.samplesheet['Ds'])

Example #8
    tb.rename(columns={
        'Symbol': 'GeneName',
        'Phase ': 'Phase',
        'Core 67': 'Core',
    },
              inplace=True)
    return tb


# Script
if __name__ == '__main__':

    print('Load dataset')
    ds = Dataset(
        counts_table='dengue',
        samplesheet='virus',
        featuresheet='humanGC38',
    )
    ds.query_samples_by_counts('total >= 50000', inplace=True)
    ds.samplesheet.rename(columns={'time [h]': 'time'}, inplace=True)
    ds.samplesheet.loc[:, 'time'] = pd.Categorical(
        ds.samplesheet.loc[:, 'time'].astype(int))
    cov = ds.samplesheet['coverage'] = ds.counts.sum(axis=0)

    print('Normalize')
    ds.counts.normalize('counts_per_million', inplace=True)

    print('Add virus reads')
    n = ds.samplesheet['numberDengueReads'].astype(int)
    ds.samplesheet['virus_reads_per_million'] = 1e6 * n / (cov + n)
    ds.samplesheet['log_virus_reads_per_million'] = np.log10(
        0.1 + ds.samplesheet['virus_reads_per_million'])
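
The per-million figure treats host coverage plus viral reads as one sequencing pool, hence the cov + n denominator; a quick arithmetic check of the formula:

# 50,000 host reads plus 5,000 viral reads: the virus makes up
# 5,000 / 55,000 of the pool, i.e. roughly 90,909 reads per million
n, cov = 5000, 50000
vrpm = 1e6 * n / (cov + n)
assert round(vrpm) == 90909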
Example #9
if __name__ == '__main__':

    print('Load RNA velocity results (loom file)')
    # NOTE: connecting the velocyto output with the usual htseq-count
    # output is a mess; here is how I solved it: velocyto skips
    # quasi-empty BAM files, so the final order is lexicographic, as in:
    # echo '' > ~/subfolders.tsv; for fdn in 10017006*; do si=$(du $fdn/star/Aligned.out.possorted.bam | cut -f1); if [ $si -ge "30" ]; then echo $fdn >> ~/subfolders.tsv; fi; done
    cellnames = pd.read_csv('../bigdata/rnavelocity_cellnames.tsv',
                            header=None).values[:, 0]
    vlm = vcy.VelocytoLoom("../bigdata/rna_velocity.loom")
    vlm.ca['CellID'] = cellnames

    print('Load normal counts and metadata and sync them with the velocity results')
    # Load and sync external metadata
    ds = Dataset(
            #counts_table='dengue',
            samplesheet='dengue',
            )
    with open('../data/metadataD_SNV_with_tsne_and_tsneSNV.pkl', 'rb') as ff:
        metadata_felix = pickle.load(ff)
    ds.samplesheet = ds.samplesheet.loc[cellnames]
    metadata_felix = metadata_felix.loc[cellnames]

    vlm.ca['ClusterName'] = metadata_felix['clusterN_SNV'].fillna(6).values
    vlm.set_clusters(vlm.ca["ClusterName"])

    ds.samplesheet['clusterN_SNV'] = metadata_felix['clusterN_SNV'].fillna(6)
    ds.samplesheet['coverage'] = metadata_felix['coverage']
    ds.samplesheet['virus_reads_per_million'] = (
        1e6 * ds.samplesheet['numberDengueReads'] /
        (ds.samplesheet['numberDengueReads'] + ds.samplesheet['coverage']))
    ds.samplesheet['log_virus_reads_per_million'] = np.log10(
        0.1 + ds.samplesheet['virus_reads_per_million'])

    print('Filter cells, genes, etc. using the velocity tutorial')
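
Since the loom annotations and the samplesheet were both reordered with the same cellnames vector, their sample orders should now agree; a cheap guard worth keeping next to this kind of sync (a sketch, using only the objects built above):

assert (ds.samplesheet.index.values == vlm.ca['CellID']).all()
assert (metadata_felix.index.values == vlm.ca['CellID']).all()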
Example #10
    pa.add_argument('--save',
                    action='store_true',
                    help='Store filtered cells dataframe of counts to file')
    pa.add_argument('--n-reads-min',
                    type=int,
                    default=15000,
                    help='Minimal number of reads for good cells')
    pa.add_argument('--keep2',
                    action='store_true',
                    help='Keep sample 2-uninfected despite low quality')
    args = pa.parse_args()

    print('Load dataset')
    ds = Dataset(
        counts_table='combined',
        featuresheet='combined',
        samplesheet='combined',
    )

    print('Filter low-quality cells')
    n_reads_min = args.n_reads_min
    ds.query_samples_by_metadata('n_reads > @n_reads_min',
                                 local_dict=locals(),
                                 inplace=True)

    if not args.keep2:
        print('Filter out sample 2-uninfected (low quality)')
        ds.query_samples_by_metadata('biosample != "2-uninfected"',
                                     inplace=True)

    print('Add normalized virus counts')
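
query_samples_by_metadata takes a pandas-style query string, with @-variables resolved through local_dict; the same two filters written as plain boolean masks on the samplesheet, for comparison (a sketch using the objects built above):

keep = ds.samplesheet['n_reads'] > n_reads_min
if not args.keep2:
    keep &= ds.samplesheet['biosample'] != '2-uninfected'
ds.samplesheet = ds.samplesheet.loc[keep]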
Example #11
'''
author:     Fabio Zanini
date:       07/08/17
content:    Test Dataset class.
'''
import numpy as np
import scipy as sp


# Script
if __name__ == '__main__':

    # NOTE: an env variable for the config file needs to be set when
    # calling this script
    from singlet.dataset import Dataset
    ds = Dataset(samplesheet='example_sheet_tsv',
                 counts_table='example_table_tsv')

    print('Hierarchical clustering of samples')
    d = ds.cluster.hierarchical(
            'samples',
            optimal_ordering=True)
    assert(tuple(d['leaves']) == ('second_sample', 'test_pipeline',
                                  'first_sample', 'third_sample'))
    print('Done!')

    print('Hierarchical clustering of features')
    ds.counts = ds.counts.iloc[:200]
    d = ds.cluster.hierarchical(
            'features',
            optimal_ordering=True)
    assert(tuple(d['leaves'])[:3] == ('PNPLA4', 'ITGAL', 'HOXA11'))
Example #12
# Script
if __name__ == '__main__':

    parser = argparse.ArgumentParser(
        description='Analyze cells infected by a given virus.')
    parser.add_argument('--virus',
                        choices=['dengue', 'Zika'],
                        default='dengue',
                        help='Virus to look at')
    args = parser.parse_args()
    virus = args.virus

    print('Load dataset')
    ds = Dataset(
        counts_table=virus.lower(),
        samplesheet='virus',
        featuresheet='humanGC38',
    )
    ds.query_samples_by_counts('total >= 50000', inplace=True)

    ds.samplesheet.rename(columns={'time [h]': 'time'}, inplace=True)
    cov = ds.samplesheet['coverage'] = ds.counts.sum(axis=0)

    print('Normalize')
    ds.counts.normalize('counts_per_million', inplace=True)

    print('Add virus reads')
    n = ds.samplesheet['number{:}Reads'.format(virus.capitalize())].astype(int)
    ds.samplesheet['virus_reads_per_million'] = 1e6 * n / (cov + n)

    print('Log counts')
Example #13
        ))

    print('Read sample metadata')
    fn = '../../data/mouse_mCMV_1/all/samplesheet.tsv'
    samplesheet = SampleSheet(
        pd.read_csv(
            fn,
            sep='\t',
            index_col=0,
            dtype={0: str},
        ))

    print('Build dataset')
    ds = Dataset(
        counts_table=counts,
        featuresheet=featuresheet,
        samplesheet=samplesheet,
    )

    print('Add normalized virus counts')
    ds.samplesheet['virus_reads_per_million'] = 1e6 * ds.samplesheet[
        'n_reads_virus'] / ds.samplesheet['n_reads']
    ds.samplesheet['log_virus_reads_per_million'] = np.log10(
        0.1 + ds.samplesheet['virus_reads_per_million'])

    print('Filter low-quality cells')
    n_reads_min = args.n_reads_min
    ds.query_samples_by_metadata('n_reads > @n_reads_min',
                                 local_dict=locals(),
                                 inplace=True)
Example #14
import os
import sys
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

os.environ['SINGLET_CONFIG_FILENAME'] = 'singlet.yml'
sys.path.append('/home/fabio/university/postdoc/singlet')
from singlet.dataset import Dataset


# Script
if __name__ == '__main__':

    ds = Dataset(
            counts_table='dengue',
            samplesheet='dengue',
            featuresheet='humanGC38',
            )
    ds.query_samples_by_counts('total >= 50000', inplace=True)

    ds.samplesheet.rename(columns={'time [h]': 'time'}, inplace=True)
    cov = ds.samplesheet['coverage'] = ds.counts.sum(axis=0)
    ds.counts.normalize('counts_per_million', inplace=True)
    ds.samplesheet['virus_reads_per_million'] = 0
    for virus in ('dengue', 'zika'):
        ind = ds.samplesheet['virus'] == virus
        col = 'number' + virus.capitalize() + 'Reads'
        n = ds.samplesheet.loc[ind, col].astype(int)
        ds.samplesheet.loc[ind, 'virus_reads_per_million'] = (
            1e6 * n / (cov.loc[ind] + n))
    ds.counts.log(inplace=True)

    # Select only some cells for comparison
Example #15
def ds():
    from singlet.dataset import Dataset
    return Dataset(counts_table='example_PBMC')
Example #16
    def plot_qcs(ds, virus_threshold=60):
        fig, axs = plt.subplots(2, 3, figsize=(12, 7))
        axs = axs.ravel()

        # Number of reads
        ax = axs[0]
        col = 'n_reads'
        col_label = 'Number of reads'
        plot_cumulative(0.1 + ds.samplesheet[col],
                        label='all cells',
                        color='k',
                        ax=ax)
        for sn, datum in ds.samplesheet[[col,
                                         'biosample']].groupby('biosample'):
            x = 0.1 + datum[col]
            plot_cumulative(x, label=sn, ax=ax)
        ax.grid(True)
        ax.set_xlabel(col_label)
        ax.set_xlim(xmin=0.9 * ds.samplesheet[col].min())
        ax.set_xscale('log')

        # Number of genes detected with 1+ reads
        ax = axs[1]
        col = 'n_genes_1+'
        col_label = 'Number of Genes (1+)'
        plot_cumulative(0.1 + ds.samplesheet[col],
                        label='all cells: {:}'.format(ds.samplesheet.shape[0]),
                        color='k',
                        ax=ax)
        for sn, datum in ds.samplesheet[[col,
                                         'biosample']].groupby('biosample'):
            x = 0.1 + datum[col]
            plot_cumulative(x, label='{:}: {:}'.format(sn, len(x)), ax=ax)
        ax.grid(True)
        ax.set_xlabel(col_label)
        ax.set_xlim(xmin=0.9 * ds.samplesheet[col].min())
        ax.set_xscale('log')
        ax.legend(loc='lower left', fontsize=8)

        # Number of genes detected with 3+ reads
        ax = axs[2]
        col = 'n_genes_3+'
        col_label = 'Number of Genes (3+)'
        plot_cumulative(0.1 + ds.samplesheet[col],
                        label='all cells',
                        color='k',
                        ax=ax)
        for sn, datum in ds.samplesheet[[col,
                                         'biosample']].groupby('biosample'):
            x = 0.1 + datum[col]
            plot_cumulative(x, label=sn, ax=ax)
        ax.grid(True)
        ax.set_xlabel(col_label)
        ax.set_xlim(xmin=0.9 * ds.samplesheet[col].min())
        ax.set_xscale('log')

        # Number of virus reads
        ax = axs[3]
        col = 'n_reads_virus'
        col_label = 'Number of virus reads'
        plot_cumulative(0.1 + ds.samplesheet[col],
                        label='all cells',
                        color='k',
                        ax=ax)
        for sn, datum in ds.samplesheet[[col,
                                         'biosample']].groupby('biosample'):
            x = 0.1 + datum[col]
            plot_cumulative(x, label=sn, ax=ax)
        ax.plot([virus_threshold] * 2, [0, 1],
                lw=1.5,
                color='k',
                alpha=0.7,
                ls='--')
        ax.grid(True)
        ax.set_xlabel(col_label)
        ax.set_xlim(xmin=0.09)
        ax.set_xscale('log')

        # Housekeeping genes (Actb, Gapdh)
        gnames = ['Actb', 'Gapdh']
        ind = ds.featuresheet.loc[ds.featuresheet['GeneName'].isin(
            gnames)].index
        dsind = Dataset(
            counts_table=ds.counts.loc[ind],
            samplesheet=ds.samplesheet,
            featuresheet=ds.featuresheet.loc[ind],
        )
        dsind.rename(axis='features', column='GeneName', inplace=True)
        for gname, ax in zip(gnames, axs[4:]):
            dd = dsind.counts.log().loc[[gname]].T
            dd['biosample'] = dsind.samplesheet['biosample']
            sns.violinplot(
                data=dd,
                x='biosample',
                y=gname,
                ax=ax,
                zorder=10,
            )
            ax.grid(True)
            ax.set_ylim(-1, 5)
            ax.set_yticks(np.arange(-1, 6))
            ax.set_yticklabels([
                '$0$',
                '$1$',
                '$10$',
                '$10^2$',
                '$10^3$',
                '$10^4$',
                '$10^5$',
            ])
            ax.set_ylabel('{:} per million reads'.format(gname))
            ax.set_xlabel('')
            for tk in ax.get_xticklabels():
                tk.set_rotation(300)

        return fig
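
plot_cumulative is used throughout these plotting snippets but never defined in the excerpts; a minimal sketch of what such a helper might look like — an empirical cumulative distribution drawn on a given axis (an assumption, not the author's implementation):

import numpy as np
import matplotlib.pyplot as plt


def plot_cumulative(values, label=None, color=None, ax=None):
    # Empirical CDF: fraction of cells at or below each value
    if ax is None:
        ax = plt.gca()
    x = np.sort(np.asarray(values))
    y = np.arange(1, len(x) + 1) / len(x)
    ax.plot(x, y, label=label, color=color)
    return ax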
Example #17
    featuresheet = FeatureSheet(pd.read_csv(fn, sep='\t', index_col=0))

    print('Read sample metadata')
    fn = '../../data/mouse_mCMV_1/all/samplesheet.tsv'
    samplesheet = SampleSheet(
        pd.read_csv(
            fn,
            sep='\t',
            index_col=0,
            dtype={0: str},
        ))

    print('Build dataset')
    ds = Dataset(
        counts_table=counts,
        featuresheet=featuresheet,
        samplesheet=samplesheet,
    )

    def plot_qcs(ds, virus_threshold=60):
        fig, axs = plt.subplots(2, 3, figsize=(12, 7))
        axs = axs.ravel()

        # Number of reads
        ax = axs[0]
        col = 'n_reads'
        col_label = 'Number of reads'
        plot_cumulative(0.1 + ds.samplesheet[col],
                        label='all cells',
                        color='k',
                        ax=ax)
Example #18
#!/usr/bin/env python
# vim: fdm=indent
'''
author:     Fabio Zanini
date:       07/08/17
content:    Test Dataset class.
'''
# Script
if __name__ == '__main__':

    # NOTE: an env variable for the config file needs to be set when
    # calling this script
    from singlet.dataset import Dataset
    ds = Dataset(
            samplesheet='example_sheet_tsv',
            counts_table='example_table_tsv')

    print('Query samples by metadata')
    ds_tmp = ds.query_samples_by_metadata(
            'experiment == "test_pipeline"',
            inplace=False)
    assert(tuple(ds_tmp.samplenames) == ('test_pipeline',))
    print('Done!')

    print('Query sample by counts in one gene')
    ds_tmp = ds.query_samples_by_counts('KRIT1 > 100', inplace=False)
    assert(tuple(ds_tmp.samplenames) == ('third_sample',))
    print('Done!')

    print('Query sample by total counts')
    ds_tmp = ds.query_samples_by_counts('total < 3000000', inplace=False)
Example #19
def ds():
    from singlet.dataset import Dataset
    return Dataset(samplesheet='example_sheet_tsv',
                   counts_table='example_table_tsv')
Example #20
#!/usr/bin/env python
# vim: fdm=indent
'''
author:     Fabio Zanini
date:       07/08/17
content:    Test Dataset class.
'''
import numpy as np

# Script
if __name__ == '__main__':

    # NOTE: an env variable for the config file needs to be set when
    # calling this script
    from singlet.dataset import Dataset
    ds = Dataset(samplesheet='example_sheet_tsv',
                 counts_table='example_table_tsv')

    print('Test feature selection by expression')
    res = ds.feature_selection.expressed(n_samples=1, exp_min=1)
    assert (res[0] == 'TSPAN6')
    print('Done!')

    print('Test feature selection by expression, in place')
    dsp = ds.copy()
    dsp.feature_selection.expressed(n_samples=1, exp_min=1, inplace=True)
    assert (dsp.featurenames[0] == 'TSPAN6')
    print('Done!')

    print('Test feature selection by overdispersed strata')
    res = ds.feature_selection.overdispersed_strata()
    assert (res[-1] == 'GLIPR2')
Example #21
'''
author:     Fabio Zanini
date:       07/08/17
content:    Test examples on PBMCs.
'''
import sys
import matplotlib.pyplot as plt
import numpy as np

# Script
if __name__ == '__main__':

    # NOTE: an env variable for the config file needs to be set when
    # calling this script
    from singlet.dataset import Dataset
    ds = Dataset(counts_table='example_PBMC')

    # Normalize
    ds.counts.normalize(method='counts_per_million', inplace=True)
    ds.counts.log(inplace=True)

    # Select features
    ds.feature_selection.expressed(n_samples=3, exp_min=1, inplace=True)
    ds.feature_selection.overdispersed_strata(n_features_per_stratum=20,
                                              inplace=True)

    # Reduce dimensionality
    vs = ds.dimensionality.tsne(n_dims=2, theta=0.5, perplexity=0.8)

    dsr = ds.copy()
    dsr.counts = vs.T
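
Copying the dataset and storing the t-SNE coordinates as its counts (dsr.counts = vs.T) is a trick to reuse the Dataset plotting machinery on the reduced space; for a quick look, a plain-matplotlib equivalent, assuming vs is a samples x 2 DataFrame:

fig, ax = plt.subplots()
ax.scatter(vs.values[:, 0], vs.values[:, 1], s=10)
ax.set_xlabel('t-SNE 1')
ax.set_ylabel('t-SNE 2')
plt.show()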
Example #22
        ))

    print('Read sample metadata')
    fn = '../../data/mouse_mCMV_1/all/samplesheet.tsv'
    samplesheet = SampleSheet(
        pd.read_csv(
            fn,
            sep='\t',
            index_col=0,
            dtype={0: str},
        ))

    print('Build dataset')
    ds = Dataset(
        counts_table=counts,
        featuresheet=featuresheet,
        samplesheet=samplesheet,
    )

    print('Add normalized virus counts')
    ds.samplesheet['virus_reads_per_million'] = 1e6 * ds.samplesheet[
        'n_reads_virus'] / ds.samplesheet['n_reads']
    ds.samplesheet['log_virus_reads_per_million'] = np.log10(
        0.1 + ds.samplesheet['virus_reads_per_million'])

    print('Filter low-quality cells')
    n_reads_min = args.n_reads_min
    ds.query_samples_by_metadata('n_reads > @n_reads_min',
                                 local_dict=locals(),
                                 inplace=True)
Example #23
        ))

    print('Read sample metadata')
    fn = '../../data/mouse_mCMV_1/all/samplesheet.tsv'
    samplesheet = SampleSheet(
        pd.read_csv(
            fn,
            sep='\t',
            index_col=0,
            dtype={0: str},
        ))

    print('Build dataset')
    ds = Dataset(
        counts_table=counts,
        featuresheet=featuresheet,
        samplesheet=samplesheet,
    )

    print('Filter low-quality cells')
    n_reads_min = args.n_reads_min
    ds.query_samples_by_metadata('n_reads > @n_reads_min',
                                 local_dict=locals(),
                                 inplace=True)

    if not args.keep2:
        print('Filter out sample 2-uninfected (low quality)')
        ds.query_samples_by_metadata('biosample != "2-uninfected"',
                                     inplace=True)

    print('Add normalized virus counts')
Example #24
import os
import sys
import pysam
import numpy as np
import pandas as pd
import xarray as xr
from collections import Counter, defaultdict
from Bio import SeqIO

os.environ['SINGLET_CONFIG_FILENAME'] = 'singlet.yml'
sys.path.append('/home/fabio/university/postdoc/singlet')
from singlet.dataset import Dataset, CountsTable

if __name__ == '__main__':

    ds = Dataset(samplesheet='dengue')

    ## Histogram of SNVs
    #n_lines = {}
    #n_lines_hist = Counter()
    #fdn = '../bigdata/DENV_singleCellVCF'
    #for fn in os.listdir(fdn):
    #    with pysam.VariantFile('{:}/{:}'.format(fdn, fn), 'r') as f:
    #        nl = sum(1 for line in f)
    #    n_lines[fn] = nl
    #    n_lines_hist[nl] += 1

    ## Example file
    #fdn = '../bigdata/DENV_singleCellVCF'
    #fn_ex = 'vars1001700612_I2.vcf'
    #f = pysam.VariantFile('{:}/{:}'.format(fdn, fn_ex), 'r')
Example #25
import os
import sys
import pickle
import argparse
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

os.environ['SINGLET_CONFIG_FILENAME'] = 'singlet.yml'
sys.path.append('/home/fabio/university/postdoc/singlet')
from singlet.dataset import Dataset, CountsTable

# Script
if __name__ == '__main__':

    ds = Dataset(samplesheet='dengue')
    data = xr.open_dataset('../bigdata/allele_frequencies.nc')
    ds.counts = CountsTable(data['aaf'].to_pandas().fillna(0))

    # Sync with Felix metadata
    with open('../data/metadataD_SNV_with_tsne.pkl', 'rb') as ff:
        metadata_felix = pickle.load(ff)
    samples = metadata_felix.index[(
        ~np.isnan(metadata_felix[['Dn', 'Ds']])).all(axis=1)]
    ds.samplesheet = ds.samplesheet.loc[samples]
    metadata_felix = metadata_felix.loc[samples]
    for col in [
            'coverage', 'Ds', 'Dn', 'depth', 'numSNV', 'Dn_s', 'tsne1_MOI1_10',
            'tsne2_MOI1_10'
    ]:
        ds.samplesheet[col] = metadata_felix[col]
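
The allele-frequency matrix comes out of a NetCDF file via xarray, and the 'aaf' variable is evidently 2-D, since to_pandas() yields a DataFrame usable as a counts table. A tiny round-trip illustrating the conversion, with made-up dimension names (the real dimensions of allele_frequencies.nc are not shown here):

import numpy as np
import xarray as xr

# Hypothetical 3 positions x 4 cells array of alternate-allele frequencies
aaf = xr.DataArray(np.random.rand(3, 4), dims=('position', 'cell'))
df = aaf.to_pandas()  # 2-D DataArray -> pandas DataFrame
assert df.shape == (3, 4)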
Example #26
import os
import sys
import argparse
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

os.environ['SINGLET_CONFIG_FILENAME'] = 'singlet.yml'
sys.path.append('/home/fabio/university/postdoc/singlet')
from singlet.dataset import Dataset

# Script
if __name__ == '__main__':

    ds = Dataset(
        counts_table='dengue',
        samplesheet='virus',
        featuresheet='humanGC38',
    )
    ds.query_samples_by_counts('total >= 50000', inplace=True)

    ds.samplesheet.rename(columns={'time [h]': 'time'}, inplace=True)
    cov = ds.samplesheet['coverage'] = ds.counts.sum(axis=0)
    ds.counts.normalize('counts_per_million', inplace=True)

    n = ds.samplesheet['numberDengueReads'].astype(int)
    ds.samplesheet['virus_reads_per_million'] = 1e6 * n / (cov + n)
    ds.counts.log(inplace=True)

    # Only select cells without virus
    ds.query_samples_by_metadata('virus_reads_per_million < 0.1', inplace=True)
Example #27
#!/usr/bin/env python
# vim: fdm=indent
'''
author:     Fabio Zanini
date:       07/08/17
content:    Test Dataset class.
'''
import numpy as np

# Script
if __name__ == '__main__':

    # NOTE: an env variable for the config file needs to be set when
    # calling this script
    from singlet.dataset import Dataset
    ds = Dataset(samplesheet='example_sheet_tsv',
                 counts_table='example_table_tsv')

    print('KNN graph via all pair comparisons')
    res = ds.graph.lshknn(
        axis='samples',
        n_neighbors=1,
        threshold=0.2,
        n_planes=128,
        slice_length=None,
    )
    assert (np.allclose(
        res.data,
        [0.9996988186962041, 1.0, 1.0, 1.0, 0.9996988186962041, 1.0, 1.0, 1.0],
        rtol=1e-02,
        atol=1e-02))
    print('Done!')
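
The assert reads res.data, so the lshknn result behaves like a scipy sparse similarity matrix; a sketch of turning such a matrix into an (i, j, similarity) edge list, assuming it is convertible to COO format:

coo = res.tocoo()
for i, j, sim in zip(coo.row, coo.col, coo.data):
    print('sample {:} -> sample {:}: similarity {:.3f}'.format(i, j, sim))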
Example #28
#!/usr/bin/env python
# vim: fdm=indent
'''
author:     Fabio Zanini
date:       07/08/17
content:    Test Dataset class.
'''
# Script
if __name__ == '__main__':

    # NOTE: an env variable for the config file needs to be set when
    # calling this script
    print('Instantiating Dataset')
    from singlet.dataset import Dataset
    ds = Dataset(samplesheet='example_sheet_tsv',
                 counts_table='example_table_tsv')
    print('Done!')

    print('Testing Dataset.__str__')
    assert (str(ds) == 'Dataset with 4 samples and 60721 features')
    print('Done!')

    print('Testing Dataset.__repr__')
    assert (ds.__repr__() == '<Dataset: 4 samples, 60721 features>')
    print('Done!')

    print('Testing Dataset.copy')
    assert (ds.copy() == ds)
    print('Done!')

    print('Testing Dataset.copy with modifications')
Example #29
            if iax1 == iax2:
                ax.set_facecolor(list(colors[indu[iax1]]) + [0.2])
            ax.grid(True)
            ax.set_xlim(0, 3.9)
            ax.set_xticks([0, 1, 2, 3, 4])
            ax.set_xticks([0.5, 1.5, 2.5, 3.5], minor=True)
            ax.set_yticklabels([])
    fig.text(0.52, 0.02, 'cluster #', ha='center')
    fig.text(0.02, 0.52, 'cluster #', va='center', rotation=90)
    fig.suptitle('Hamming distance distributions across SNV clusters')
    plt.tight_layout(h_pad=0, w_pad=0, rect=(0.03, 0.03, 1, 0.97))

    # Calculate transcriptome distances
    ds = Dataset(
            samplesheet='dengue',
            counts_table='dengue',
            featuresheet='humanGC38',
            )
    ds.samplesheet['cluster_SNV'] = clusters
    ds.counts.normalize(inplace=True)
    ds.rename(axis='features', column='GeneName', inplace=True)
    ds.feature_selection.unique(inplace=True)

    # Restrict to differentially expressed genes
    with open('../data/genes_diff_expressed_clustersSNV.tsv', 'rt') as f:
        genes = f.read().split('\t')
    dsd = ds.query_features_by_name(genes)
    dsd.counts.log(inplace=True)

    dsp = dsd.split('cluster_SNV')
    dclut = {}