def test_gtex_reindex(expression_data):
    """Check that TPM and CLR outputs share the normalizer's gene ordering.

    TPM is computed from the raw counts and CLR from that TPM (imputing with
    ``normalize.impute``). Both results must have identical columns, and
    those columns must equal the index of the normalizer's ``gene_lengths``.
    """
    normalizer = normalize.Normalizer(identifier='symbol')
    tpm_frame = normalizer.tpm_from_counts(expression_data.counts)
    clr_frame = normalizer.clr_from_tpm(tpm_frame, imputer=normalize.impute)

    tpm_genes = tpm_frame.columns
    assert (tpm_genes == clr_frame.columns).all()
    assert (tpm_genes == normalizer.gene_lengths.index).all()
def test_clr_functions(expression_data):
    """Round-trip imputed TPM through CLR and back, expecting the same values.

    The gene list passed to each transform is the column set of its own
    input, so no genes are dropped along the way.
    """
    normalizer = normalize.Normalizer(identifier='symbol')
    imputed_tpm = normalize.impute(expression_data.tpm)

    clr_frame = normalizer.clr_from_tpm(
        imputed_tpm, gene_list=imputed_tpm.columns
    )
    round_trip_tpm = normalizer.tpm_from_clr(
        clr_frame, gene_list=clr_frame.columns
    )

    assert np.allclose(imputed_tpm, round_trip_tpm)
def test_normalizer_tpm_from_rpkm_allgenes(expression_data):
    """Check RPKM -> TPM conversion against the reference TPM, using all genes.

    The computed frame must keep the sample index of the reference, and its
    values (taken in the reference column order) must match numerically.
    """
    normalizer = normalize.Normalizer(identifier='symbol')
    rpkm_input = expression_data.rpkm
    expected_tpm = expression_data.tpm

    computed_tpm = normalizer.tpm_from_rpkm(rpkm_input)

    assert (expected_tpm.index == computed_tpm.index).all()
    # Select by the reference columns so both value arrays are aligned.
    aligned_values = computed_tpm[expected_tpm.columns].values
    assert np.allclose(expected_tpm.values, aligned_values)
def test_zscore_from_clr(expression_data):
    """Check that z-scoring CLR data preserves the shape of the input.

    Every sample is labelled with the same tissue, ``'Liver'``.
    """
    normalizer = normalize.Normalizer(identifier='symbol')
    imputed_tpm = normalize.impute(expression_data.tpm)
    clr_frame = normalizer.clr_from_tpm(
        imputed_tpm, gene_list=imputed_tpm.columns
    )

    # One tissue label per sample, indexed like the CLR rows.
    tissue_labels = pd.Series('Liver', index=clr_frame.index)
    z_scores = normalizer.z_score_from_clr(clr_frame, tissue_labels)

    assert z_scores.shape == clr_frame.shape
def test_normalizer_tpm_from_counts(expression_data):
    """Check counts -> TPM conversion against the reference TPM.

    Columns, index and numeric values of the computed frame must all agree
    with the reference.
    """
    normalizer = normalize.Normalizer(identifier='symbol')
    raw_counts = expression_data.counts
    expected_tpm = expression_data.tpm

    computed_tpm = normalizer.tpm_from_counts(raw_counts)

    assert (expected_tpm.columns == computed_tpm.columns).all()
    assert (expected_tpm.index == computed_tpm.index).all()
    assert np.allclose(expected_tpm.values, computed_tpm.values)
def test_normalizer_tpm_from_subset(expression_data):
    """Check TPM renormalization over the full gene set and over a subset.

    Without a gene list the values must come back unchanged. Restricted to
    the first 100 columns, every sample's values must sum to one million.
    """
    normalizer = normalize.Normalizer(identifier='symbol')
    reference_tpm = expression_data.tpm

    # Full gene set: renormalizing must be a no-op on the values.
    full_result = normalizer.tpm_from_subset(reference_tpm)
    assert np.allclose(reference_tpm.values, full_result.values)

    # Subset: each row has to be rescaled so that it totals 1e6 again.
    subset_result = normalizer.tpm_from_subset(
        reference_tpm, reference_tpm.columns[:100]
    )
    row_totals = subset_result.sum(axis=1).values
    assert np.allclose(row_totals - 1e6, np.zeros_like(row_totals))
def test_ordinalize(expression_data):
    """Check ordinalization of CLR data with a single cutoff.

    Entries at or below the cutoff must map to ``min_value``; entries above
    it must map to ``min_value + 1``.
    """
    normalizer = normalize.Normalizer(identifier='symbol')
    imputed_tpm = normalize.impute(expression_data.tpm)
    clr_frame = normalizer.clr_from_tpm(
        imputed_tpm, gene_list=imputed_tpm.columns
    )

    thresholds = [0.37]
    lowest_level = 5
    ordinals = normalizer.ordinalize(
        clr_frame, thresholds, min_value=lowest_level
    )

    threshold = thresholds[0]
    # Membership in each side of the cutoff must coincide with the level.
    assert ((clr_frame <= threshold) == (ordinals == lowest_level)).all().all()
    assert ((clr_frame > threshold) == (ordinals == 1 + lowest_level)).all().all()
def test_alr_functions_dirimpute(expression_data):
    """Check TPM -> ALR with imputation done inside the ALR call.

    The first gene serves as the reference; the resulting columns must be
    exactly the remaining genes, in their original order.
    """
    normalizer = normalize.Normalizer(identifier='symbol')

    gene_names = list(expression_data.tpm.columns)
    reference, remaining = gene_names[:1], gene_names[1:]

    alr_frame = normalizer.alr_from_tpm(
        expression_data.tpm,
        reference,
        gene_list=gene_names,
        imputer=normalize.impute,
    )

    assert remaining == list(alr_frame.columns)
def expression_data():
    """Build random example counts plus the matching TPM and RPKM frames.

    Counts are a 100 x 1000 frame of rounded uniform random values scaled by
    1234, with columns named after the first 1000 entries of a symbol-based
    ``Normalizer``'s ``gene_lengths``. TPM and RPKM are derived from those
    counts and lengths.

    Returns:
        ``ExpressionData(counts, tpm, rpkm)``.
    """
    # NOTE(review): the tests in this file take `expression_data` as an
    # argument, so this is presumably registered as a pytest fixture; no
    # decorator is visible on this definition - confirm.
    n_samples, n_genes = 100, 1000
    read_ceiling = 1234
    random_reads = np.round(read_ceiling * np.random.rand(n_samples, n_genes))
    counts = pd.DataFrame(random_reads)

    # Borrow gene names and lengths from a Normalizer instance.
    lengths = normalize.Normalizer(identifier='symbol').gene_lengths[:n_genes]
    counts.columns = lengths.index
    lengths_kb = lengths / 1e3

    # TPM: length-normalize first, then scale each sample to one million.
    per_kilobase = counts.divide(lengths_kb, axis='columns')
    tpm = per_kilobase.divide(per_kilobase.sum(axis=1) / 1e6, axis='index')

    # RPKM: scale each sample to one million first, then length-normalize.
    per_million = counts.divide(counts.sum(axis=1) / 1e6, axis='index')
    rpkm = per_million.divide(lengths_kb, axis='columns')

    return ExpressionData(counts, tpm, rpkm)