def test_htcount_split_by_rpm():
    """split_by_rpm should partition the table into above/below-threshold halves."""
    counts = HTCountFilter(r"all_expr.csv")
    above, below = counts.split_by_rpm(threshold=60)
    expected_above = general.load_csv(r"all_expr_above60_rpm.csv", 0)
    expected_below = general.load_csv(r"all_expr_below60_rpm.csv", 0)
    assert np.all(above.df == expected_above)
    assert np.all(below.df == expected_below)
def test_htcount_filter_biotype():
    """filter_biotype should keep only rows of the requested biotype (non-inplace)."""
    ref_path = 'extendedCelegansIDs_bigTable_39col_2019format_edit.csv'
    h = HTCountFilter("all_expr_biotype.csv")
    # default biotype is protein_coding; piRNA is requested explicitly
    protein_coding = h.filter_biotype(ref=ref_path, inplace=False)
    pirna = h.filter_biotype('piRNA', ref=ref_path, inplace=False)
    truth_protein_coding = general.load_csv('all_expr_biotype_protein_coding.csv', 0)
    truth_pirna = general.load_csv('all_expr_biotype_piRNA.csv', 0)
    assert np.all(truth_protein_coding == protein_coding.df)
    assert np.all(truth_pirna == pirna.df)
def test_deseq_filter_fold_change_direction():
    """filter_fold_change_direction should select positive or negative FC rows."""
    d = DESeqFilter("test_deseq_fc.csv")
    positive = d.filter_fold_change_direction('pos', inplace=False)
    negative = d.filter_fold_change_direction('neg', inplace=False)
    assert np.all(positive.df == general.load_csv("test_deseq_fc_pos_truth.csv", 0))
    assert np.all(negative.df == general.load_csv("test_deseq_fc_neg_truth.csv", 0))
def test_deseq_split_fold_change():
    """split_fold_change_direction should return (positive-FC, negative-FC) filters.

    Fix: the original constructed ``DESeqFilter("test_deseq_fc.csv")`` twice;
    the first instance was immediately discarded, wasting a file load.
    """
    pos_truth = general.load_csv("test_deseq_fc_pos_truth.csv", 0)
    neg_truth = general.load_csv("test_deseq_fc_neg_truth.csv", 0)
    d = DESeqFilter("test_deseq_fc.csv")
    pos, neg = d.split_fold_change_direction()
    assert np.all(pos.df == pos_truth)
    assert np.all(neg.df == neg_truth)
def test_htcount_filter_biotype_opposite():
    """filter_biotype with opposite=True should drop the named biotype in place."""
    expected = general.load_csv(r'all_expr_biotype_no_piRNA.csv', 0)
    expected.sort_index(inplace=True)
    counts = HTCountFilter("all_expr_biotype.csv")
    counts.filter_biotype('piRNA', opposite=True, inplace=True)
    counts.df.sort_index(inplace=True)
    # row order is not guaranteed, so both frames are index-sorted before comparing
    assert np.all(counts.df == expected)
def test_deseq_filter_abs_fold_change():
    """filter_abs_fold_change(4) should keep rows with |fold change| >= 4."""
    d = DESeqFilter("test_deseq_fc.csv")
    filtered = d.filter_abs_fold_change(4, inplace=False)
    expected = general.load_csv("test_deseq_fc_4_truth.csv", 0)
    # sort both sides so the comparison is order-independent
    filtered.df.sort_index(inplace=True)
    expected.sort_index(inplace=True)
    assert np.all(filtered.df == expected)
def import_target(self):
    """Load the target filename into ``self.target_df`` and initialize an
    empty WBGene-name slot for every row of the target table."""
    self.target_df = general.load_csv(self.target_filename, drop_gene_names=False)
    self.target_wbgene = ['' for _ in range(self.target_df.shape[0])]
def test_filter_by_bigtable_group_union():
    """filter_by_bigtable_group in 'union' mode should keep genes in either attribute."""
    expected = general.load_csv(r'all_expr_filter_by_bigtable_union_truth.csv', 0)
    expected.sort_index(inplace=True)
    h = HTCountFilter('all_expr_filter_by_bigtable.csv')
    result = h.filter_by_bigtable_group(
        ['epigenetic_related_genes', 'P_granule_proteins'],
        mode='union',
        ref='extendedCelegansIDs_bigTable_39col_2019format_edit.csv',
        inplace=False)
    result.df.sort_index(inplace=True)
    assert np.all(result.df == expected)
def test_deseq_filter_significant_opposite():
    """filter_significant with opposite=True should keep only non-significant rows."""
    expected = general.load_csv(r'test_deseq_not_sig_truth.csv', 0)
    d = DESeqFilter("test_deseq_sig.csv")
    d.filter_significant(alpha=0.05, opposite=True)
    # NaN != NaN, so fill both frames with a sentinel before the equality check
    sentinel = 1234567890
    for frame in (d.df, expected):
        frame.sort_index(inplace=True)
        frame.fillna(sentinel, inplace=True)
    assert np.all(d.df == expected)
def test_filter_inplace():
    """_inplace should return a new object when inplace=False and mutate self otherwise."""
    d = DESeqFilter('test_deseq_no_nans.csv')
    d_copy = DESeqFilter('test_deseq_no_nans.csv')
    truth = general.load_csv('all_expr.csv')
    # inplace=False: result carries the new df, original object untouched
    returned = d._inplace(truth, opposite=False, inplace=False, suffix='suffix')
    assert np.all(returned.df == truth)
    assert np.all(d.df == d_copy.df)
    # inplace=True: the object itself is mutated
    d._inplace(truth, opposite=False, inplace=True, suffix='other_suffix')
    assert np.all(d.df == truth)
def test_filter_low_rpm_reverse():
    """filter_low_rpm with opposite=True should keep only the below-threshold rows.

    Fix: removed four leftover debug ``print`` statements (shapes and frames)
    that cluttered the test output and served no assertion purpose.
    """
    h = HTCountFilter(r"all_expr.csv")
    low_truth = general.load_csv(r"all_expr_below60_rpm.csv", 0)
    h.filter_low_rpm(threshold=60, opposite=True)
    # sort both sides so the comparison is order-independent
    h.df.sort_index(inplace=True)
    low_truth.sort_index(inplace=True)
    assert np.all(h.df == low_truth)
def read_reference(self):
    """Load the reference file and build the name-to-WBGene lookup dictionaries
    (gene symbols, sequence names, and semicolon-separated 'other' IDs)."""
    ref = general.load_csv(self.reference_filename, drop_gene_names=False)
    wbgenes = ref[self.ref_wbgene_col]

    def build_lookup(column):
        # map each non-NaN name in `column` to its WBGene identifier
        return {name: wbgene
                for name, wbgene in zip(ref[column], wbgenes)
                if not pd.isna(name)}

    self.gene_symbol_dict = build_lookup(self.ref_gene_col)
    self.sequence_dict = build_lookup(self.ref_seq_col)
    # the 'other ID' column packs multiple aliases per row, separated by ';'
    for aliases, wbgene in zip(ref[self.ref_other_col].str.split(pat=";"), wbgenes):
        if isinstance(aliases, list):
            for alias in aliases:
                self.other_id_dict[alias] = wbgene
def test_deseq_filter_significant():
    """filter_significant should keep only rows passing the alpha threshold."""
    d = DESeqFilter("test_deseq_sig.csv")
    d.filter_significant(alpha=0.05)
    expected = general.load_csv("test_deseq_sig_truth.csv", 0)
    assert np.all(d.df == expected)
def test_filter_low_rpm():
    """filter_low_rpm should drop rows below the given RPM threshold."""
    h = HTCountFilter("all_expr_low_rpm.csv")
    h.filter_low_rpm(threshold=5)
    expected = general.load_csv("all_expr_low_rpm_truth.csv", 0)
    # numeric comparison with tolerance rather than exact equality
    assert np.isclose(expected, h.df).all()
def test_deseq_filter_top_n():
    """filter_top_n(10) should keep the ten top-ranked rows."""
    d = DESeqFilter("test_deseq.csv")
    d.filter_top_n(10)
    expected = general.load_csv("test_deseq_top10.csv", 0)
    assert np.isclose(expected, d.df).all()
def test_htcountfilter_norm_reads_to_rpm():
    """norm_reads_to_rpm should normalize counts using the feature-count file."""
    h = HTCountFilter(r"all_expr.csv")
    h.norm_reads_to_rpm(r"all_feature.csv")
    expected = general.load_csv(r"test_norm_reads_rpm.csv", 0)
    assert np.isclose(expected, h.df).all()
def enrich_big_table(self, attributes: list = None, fdr: float = 0.05, reps=10000,
                     biotype: str = 'protein_coding', big_table_pth: str = __bigtable_path__,
                     save_csv: bool = True, fname=None):
    """
    Calculates enrichment scores, p-values and q-values \
    for enrichment and depletion of selected attributes from the Big Table. \
    P-values are calculated using a randomization test, and corrected for multiple comparisons using \
    the Benjamini-Hochberg step-up procedure (original FDR method). \
    Enrichment/depletion is determined automatically by the calculated enrichment score: \
    if log2(enrichment score) is positive then enrichment is assumed, \
    and if log2(enrichment score) is negative then depletion is assumed.

    :param attributes: An iterable of attribute names (strings). If None, a manual input prompt will be raised.
    :param fdr: float. Indicates the FDR threshold for significance.
    :param reps: How many repetitions to run the randomization for. \
    10,000 is the default. Recommended 10,000 or higher.
    :param big_table_pth: the path of the Big Table file to be used as reference.
    :param biotype: the biotype you want your reference to have. 'all' will include all biotypes, \
    'protein_coding' will include only protein-coding genes in the reference, etc.
    :param save_csv: bool. If True, will save the results to a .csv file, under the name specified in 'fname'.
    :param fname: str/Path. The full path and name of the file to which to save the results. For example: \
    r'C:\dir\file'. No '.csv' suffix is required. If None (default), fname will be requested in a manual prompt.
    :return: a pandas DataFrame with the indicated attribute names as rows/index, and the columns
    'log2_enrichment_score' and 'pvalue'.

    .. figure:: bigtable_en.png
       :align: center
       :scale: 40 %

       Example plot of big table enrichment
    """
    # Normalize 'attributes' into a list of strings (prompt the user if absent).
    if attributes is None:
        attributes = self._from_string(
            "Please insert attributes separated by newline "
            "(for example: \n'epigenetic_related_genes\nnrde-3 targets\nALG-3/4 class small RNAs')")
    elif isinstance(attributes, str):
        attributes = [attributes]
    else:
        assert isinstance(attributes, (list, tuple, set)), \
            "'attributes' must be a list, tuple or set!"
    try:
        big_table = general.load_csv(big_table_pth, 0)
    except Exception as e:
        # Fix: was a bare 'except:', which also swallowed KeyboardInterrupt/
        # SystemExit and discarded the original cause; now chained explicitly.
        raise ValueError("Invalid or nonexistent big table path!") from e
    assert isinstance(biotype, str)
    if biotype != 'all':
        # restrict the reference to the requested biotype
        big_table = big_table[big_table['bioType'] == biotype]

    def fraction(mysrs):
        # Fraction of non-NaN entries in the series (attribute membership rate).
        # Fix: was a lambda assignment (PEP 8 E731).
        return (mysrs.shape[0] - mysrs.isna().sum()) / mysrs.shape[0]

    enriched_list = []
    for k, attribute in enumerate(attributes):
        assert isinstance(attribute, str), \
            f"Error in attribute {attribute}: attributes must be strings!"
        print(f"Finished {k} attributes out of {len(attributes)}")
        srs = big_table[attribute]
        obs_srs = srs.loc[self.gene_set]
        expected_fraction = fraction(srs)
        observed_fraction = fraction(obs_srs)
        # small pseudocount avoids log2(0) / division by zero
        log2_enrichment_score = np.log2(
            (observed_fraction + 0.0001) / (expected_fraction + 0.0001))
        # Randomization test: count how often a random same-size sample is at
        # least as extreme as the observation, in the direction implied by the
        # enrichment score. Fix: the direction test is loop-invariant, so it is
        # hoisted out of the per-repetition generator instead of being
        # re-evaluated 'reps' times.
        n = obs_srs.shape[0]
        if log2_enrichment_score >= 0:
            success = sum(
                fraction(srs.loc[np.random.choice(srs.index, n, replace=False)])
                >= observed_fraction for _ in range(reps))
        else:
            success = sum(
                fraction(srs.loc[np.random.choice(srs.index, n, replace=False)])
                <= observed_fraction for _ in range(reps))
        # add-one smoothing keeps the p-value strictly positive
        pval = (success + 1) / (reps + 1)
        enriched_list.append((attribute, n, int(n * observed_fraction),
                              n * expected_fraction, log2_enrichment_score, pval))
    enriched_df = pd.DataFrame(enriched_list, columns=[
        'name', 'samples', 'n obs', 'n exp', 'log2_enrichment_score', 'pvals'])
    # Benjamini-Hochberg correction across all tested attributes
    significant, padj = multitest.fdrcorrection(enriched_df['pvals'].values, alpha=fdr)
    enriched_df['padj'] = padj
    enriched_df['significant'] = significant
    enriched_df.set_index('name', inplace=True)
    self._plot_enrich_big_table(enriched_df)
    if save_csv:
        self._enrichment_save_csv(enriched_df, fname)
    print(enriched_df)
    return enriched_df