def get_dataset(tissue, membrane_only=True):
    """Build a Dataset for one tissue from its parsed counts.

    Parameters:
        tissue: tissue identifier, forwarded to parse_counts().
        membrane_only: when true, keep only the rows of the counts table
            whose labels are listed by parse_go_plasma_membrane() and are
            present in the counts index.

    Returns:
        A Dataset wrapping SampleSheet(cell_types) and the (possibly
        filtered) counts re-wrapped in a CountsTable.
    """
    # NOTE(review): `cell_types` is not bound anywhere in this function;
    # presumably it is a module-level name defined elsewhere - confirm.
    table = parse_counts(tissue)

    if membrane_only:
        membrane = parse_go_plasma_membrane()
        # Keep only the annotated genes that actually occur in the counts
        present = membrane[membrane.isin(table.index)]
        table = table.loc[present]

    return Dataset(
        samplesheet=SampleSheet(cell_types),
        counts_table=CountsTable(table),
    )
def parse_counts(tissue, regenerate=False):
    """Load the counts table of a tissue from its CSV file.

    The file is looked up under '../../data/MACAtSNE/' using the tissue's
    'annotation glob' config entry when present, otherwise the tissue name
    itself. Unless `regenerate` is true, an already normalized file
    ('...CountTableNormalized.csv') is preferred over the raw one.

    Parameters:
        tissue: key into `config`; also used in error messages.
        regenerate: when true, only the raw 'CountTable' file is searched,
            so normalization is recomputed and rewritten.

    Returns:
        A CountsTable with index named 'GeneName' and columns named 'Cell'.

    Raises:
        IOError: if no file, or more than one file, matches the pattern.
    """
    import glob

    tissue_config = config[tissue]
    prefix = (
        tissue_config['annotation glob']
        if 'annotation glob' in tissue_config
        else tissue
    )

    suffixes = (
        ('CountTable', )
        if regenerate
        else ('CountTableNormalized', 'CountTable')
    )

    # Try the candidate suffixes in order and stop at the first one that
    # matches at least one file
    for suffix in suffixes:
        pattern = '../../data/MACAtSNE/{:}{:}.csv'.format(prefix, suffix)
        matches = glob.glob(pattern)
        if matches:
            break

    if not matches:
        raise IOError('Counts file not found for tissue: {:}'.format(tissue))
    if len(matches) > 1:
        raise IOError(
            'Several counts files found for tissue: {:}'.format(tissue))
    fn = matches[0]

    table = pd.read_csv(fn, sep=',', index_col=0)

    # Column names of the form 'a.b[...]' are rewritten as 'b_a'
    if '.' in table.columns[0]:
        renamed = []
        for column in table.columns:
            renamed.append('{1}_{0}'.format(*(column.split('.')[:2])))
        table.columns = renamed

    table.index.name = 'GeneName'
    table.columns.name = 'Cell'
    table = CountsTable(table)

    # A file whose path contains 'Normalized' is taken as already processed
    if 'Normalized' in fn:
        table._normalized = 'counts_per_million'
        return table

    print('Normalize counts')
    table.normalize(inplace=True)

    print('Log counts')
    table.log(inplace=True)

    # Cache the processed table next to the raw file ('.csv' is stripped
    # and replaced by 'Normalized.csv')
    print('Write normalized counts to file')
    table.to_csv(fn[:-4] + 'Normalized.csv', sep=',')

    return table
def ct():
    """Return the first 200 rows of the 'example_table_tsv' CountsTable."""
    from singlet.counts_table import CountsTable

    example = CountsTable.from_tablename('example_table_tsv')
    return example.iloc[:200]
def get_dataset(tissue, membrane_only=True, regenerate=False, go_contains=None, go_exclude=None):
    """Build a Dataset for a tissue, merging its sub-tissues if several.

    Each entry of tissues_prediction[tissue] is parsed separately
    (annotations and counts), optionally filtered by gene, and the parts
    are then combined into a single Dataset.

    Parameters:
        tissue: key into `tissues_prediction`.
        membrane_only: when true, keep only genes listed in the index of
            parse_go_plasma_membrane() that are present in the counts.
        regenerate: forwarded to parse_counts().
        go_contains: if given, keep only genes whose 'GONames' entry
            matches this pattern (pandas `str.contains`).
        go_exclude: if given, keep only genes whose 'GONames' entry does
            NOT match this pattern. Mutually exclusive with go_contains.

    Returns:
        A Dataset holding the sample sheet and counts table of the tissue.

    Raises:
        ValueError: if both go_contains and go_exclude are given, or if
            the tissue has no sub-tissues to load.
    """
    # Validate the arguments before touching any file, so that a bad call
    # fails immediately instead of after the first sub-tissue is parsed
    if (go_contains is not None) and (go_exclude is not None):
        raise ValueError('Use either go_contains or go_exclude')

    # Some tissues like brain were split for sorting, we merge them here
    dss = []
    for tissue_facs in tissues_prediction[tissue]:
        cell_types, _plates = parse_annotations(tissue_facs)
        counts = parse_counts(tissue_facs, regenerate=regenerate)

        if membrane_only:
            go = parse_go_plasma_membrane().index
            genes_membrane = go[go.isin(counts.index)]
            counts = counts.loc[genes_membrane]

        if (go_contains is not None) or (go_exclude is not None):
            go = parse_go_plasma_membrane()
            if go_contains is not None:
                selected = go['GONames'].str.contains(go_contains)
            else:
                selected = ~go['GONames'].str.contains(go_exclude)
            genes = np.intersect1d(go.index[selected], counts.index)
            counts = counts.loc[genes]

        dss.append({'samplesheet': cell_types, 'counts': counts})

    if not dss:
        # Without this guard the merge code below fails with an obscure
        # UnboundLocalError
        raise ValueError(
            'No sub-tissues found for tissue: {:}'.format(tissue))

    if len(dss) == 1:
        only = dss[0]
        return Dataset(
            samplesheet=SampleSheet(only['samplesheet']),
            counts_table=only['counts'],
        )

    # Merging is kind of messy because some genes are absent from either
    # subtissue; absent genes get a placeholder value of -1.0 for now, a
    # better solution is pending (the real numbers exist somewhere)
    genes = set()
    for part in dss:
        genes |= set(part['counts'].index.values)
    genes = pd.Index(sorted(genes), name=dss[-1]['counts'].index.name)

    for part in dss:
        genes_missing = genes[~genes.isin(part['counts'].index)]
        for gene in genes_missing:
            # The stuff is normalized, pseudocounted, and logged
            part['counts'].loc[gene] = -1.0
        # Bring every part to the same gene order
        part['counts'] = part['counts'].loc[genes]

    ngenes = len(genes)
    ncells = sum(part['samplesheet'].shape[0] for part in dss)
    samplesheet_all = pd.concat(
        [part['samplesheet'] for part in dss],
        axis=0,
    )
    counts_all = pd.DataFrame(
        np.zeros((ngenes, ncells), float),
        index=genes,
        columns=samplesheet_all.index,
    )
    for part in dss:
        counts_all.loc[:, part['counts'].columns.values] = part['counts'].values
    counts_all = CountsTable(counts_all)

    # Carry over the normalization flag from the last sub-tissue
    last_counts = dss[-1]['counts']
    if last_counts._normalized:
        counts_all._normalized = last_counts._normalized

    return Dataset(
        samplesheet=SampleSheet(samplesheet_all),
        counts_table=counts_all,
    )
#!/usr/bin/env python
# vim: fdm=indent
'''
author:     Fabio Zanini
date:       15/08/17
content:    Test CountsTable class.
'''
# Script
if __name__ == '__main__':

    # NOTE: an env variable for the config file needs to be set when
    # calling this script
    from singlet.counts_table import CountsTable

    # Load the example table shipped with the configuration
    ct = CountsTable.from_tablename('example_table_tsv')

    print('Test binning of CountsTable')
    # Bin the first 200 rows in place; the largest bin index must be 4
    ct = ct.iloc[:200]
    ct.bin(result='index', inplace=True)
    assert ct.values.max() == 4
    print('Done!')
def test_initialize():
    """Check that a CountsTable can be created from a table name."""
    from singlet.counts_table import CountsTable

    # Only construction is exercised; the result itself is not inspected
    CountsTable.from_tablename('example_table_tsv')
def test_initialize_fromdataset():
    """Check that a CountsTable can be created from a dataset name."""
    from singlet.counts_table import CountsTable

    # Only construction is exercised; the result itself is not inspected
    CountsTable.from_datasetname('example_dataset')