Example #1
def test_htcount_split_by_rpm():
    h = HTCountFilter(r"all_expr.csv")
    high_truth = general.load_csv(r"all_expr_above60_rpm.csv", 0)
    low_truth = general.load_csv(r"all_expr_below60_rpm.csv", 0)
    high, low = h.split_by_rpm(threshold=60)
    assert np.all(high.df == high_truth)
    assert np.all(low.df == low_truth)
Example #2
def test_htcount_filter_biotype():
    truth_protein_coding = general.load_csv('all_expr_biotype_protein_coding.csv', 0)
    truth_pirna = general.load_csv('all_expr_biotype_piRNA.csv', 0)
    h = HTCountFilter("all_expr_biotype.csv")
    protein_coding = h.filter_biotype(ref='extendedCelegansIDs_bigTable_39col_2019format_edit.csv', inplace=False)
    pirna = h.filter_biotype('piRNA', ref='extendedCelegansIDs_bigTable_39col_2019format_edit.csv', inplace=False)
    assert np.all(truth_protein_coding == protein_coding.df)
    assert np.all(truth_pirna == pirna.df)
Example #3
def test_deseq_filter_fold_change_direction():
    pos_truth = general.load_csv("test_deseq_fc_pos_truth.csv", 0)
    neg_truth = general.load_csv("test_deseq_fc_neg_truth.csv", 0)
    d = DESeqFilter("test_deseq_fc.csv")
    pos = d.filter_fold_change_direction('pos', inplace=False)
    neg = d.filter_fold_change_direction('neg', inplace=False)
    assert np.all(pos.df == pos_truth)
    assert np.all(neg.df == neg_truth)
Example #4
def test_deseq_split_fold_change():
    pos_truth = general.load_csv("test_deseq_fc_pos_truth.csv", 0)
    neg_truth = general.load_csv("test_deseq_fc_neg_truth.csv", 0)
    d = DESeqFilter("test_deseq_fc.csv")
    pos, neg = d.split_fold_change_direction()
    assert np.all(pos.df == pos_truth)
    assert np.all(neg.df == neg_truth)
Example #5
def test_htcount_filter_biotype_opposite():
    truth_no_pirna = general.load_csv(r'all_expr_biotype_no_piRNA.csv', 0)
    h = HTCountFilter("all_expr_biotype.csv")
    h.filter_biotype('piRNA', opposite=True, inplace=True)
    h.df.sort_index(inplace=True)
    truth_no_pirna.sort_index(inplace=True)
    assert np.all(h.df == truth_no_pirna)
Example #6
def test_deseq_filter_abs_fold_change():
    truth = general.load_csv("test_deseq_fc_4_truth.csv", 0)
    d = DESeqFilter("test_deseq_fc.csv")
    fc4 = d.filter_abs_fold_change(4, inplace=False)
    fc4.df.sort_index(inplace=True)
    truth.sort_index(inplace=True)
    assert np.all(fc4.df == truth)
Example #7
    def import_target(self):
        """
        Import the target file into a pandas DataFrame.
        """
        self.target_df = general.load_csv(self.target_filename, drop_gene_names=False)
        self.target_wbgene = [''] * self.target_df.shape[0]
Example #8
def test_filter_by_bigtable_group_union():
    union_truth = general.load_csv(r'all_expr_filter_by_bigtable_union_truth.csv', 0)
    h = HTCountFilter('all_expr_filter_by_bigtable.csv')
    union = h.filter_by_bigtable_group(['epigenetic_related_genes', 'P_granule_proteins'], mode='union',
                                       ref='extendedCelegansIDs_bigTable_39col_2019format_edit.csv', inplace=False)
    union.df.sort_index(inplace=True)
    union_truth.sort_index(inplace=True)
    assert np.all(union.df == union_truth)
Example #9
def test_deseq_filter_significant_opposite():
    truth = general.load_csv(r'test_deseq_not_sig_truth.csv', 0)
    d = DESeqFilter("test_deseq_sig.csv")
    d.filter_significant(alpha=0.05, opposite=True)
    d.df.sort_index(inplace=True)
    truth.sort_index(inplace=True)
    # NaN != NaN, so fill missing values with a sentinel before the element-wise comparison
    truth.fillna(1234567890, inplace=True)
    d.df.fillna(1234567890, inplace=True)
    assert np.all(d.df == truth)
Example #10
def test_filter_inplace():
    d = DESeqFilter('test_deseq_no_nans.csv')
    d_copy = DESeqFilter('test_deseq_no_nans.csv')
    truth = general.load_csv('all_expr.csv')
    d_inplace_false = d._inplace(truth, opposite=False, inplace=False, suffix='suffix')
    assert np.all(d_inplace_false.df == truth)
    assert np.all(d.df == d_copy.df)
    d._inplace(truth, opposite=False, inplace=True, suffix='other_suffix')
    assert np.all(d.df == truth)
Example #11
def test_filter_low_rpm_reverse():
    h = HTCountFilter(r"all_expr.csv")
    low_truth = general.load_csv(r"all_expr_below60_rpm.csv", 0)
    h.filter_low_rpm(threshold=60, opposite=True)
    h.df.sort_index(inplace=True)
    low_truth.sort_index(inplace=True)
    assert np.all(h.df == low_truth)
Example #12
    def read_reference(self):
        """
        load the reference file into a pandas DataFrame
        """
        ref = general.load_csv(self.reference_filename, drop_gene_names=False)
        self.gene_symbol_dict = {
            name: wbgene
            for name, wbgene in zip(ref[self.ref_gene_col], ref[
                self.ref_wbgene_col]) if not pd.isna(name)
        }
        self.sequence_dict = {
            name: wbgene
            for name, wbgene in zip(ref[self.ref_seq_col], ref[
                self.ref_wbgene_col]) if not pd.isna(name)
        }

        split_other_id = ref[self.ref_other_col].str.split(pat=";")
        for namelst, wbgene in zip(split_other_id, ref[self.ref_wbgene_col]):
            if isinstance(namelst, list):
                for name in namelst:
                    self.other_id_dict[name] = wbgene
Example #13
def test_deseq_filter_significant():
    truth = general.load_csv("test_deseq_sig_truth.csv", 0)
    d = DESeqFilter("test_deseq_sig.csv")
    d.filter_significant(alpha=0.05)
    assert np.all(d.df == truth)
Example #14
def test_filter_low_rpm():
    truth = general.load_csv("all_expr_low_rpm_truth.csv", 0)
    h = HTCountFilter("all_expr_low_rpm.csv")
    h.filter_low_rpm(threshold=5)
    assert np.isclose(truth, h.df).all()
Example #15
def test_deseq_filter_top_n():
    truth = general.load_csv("test_deseq_top10.csv", 0)
    d = DESeqFilter("test_deseq.csv")
    d.filter_top_n(10)
    assert np.isclose(truth, d.df).all()
Example #16
def test_htcountfilter_norm_reads_to_rpm():
    truth = general.load_csv(r"test_norm_reads_rpm.csv", 0)
    h = HTCountFilter(r"all_expr.csv")
    h.norm_reads_to_rpm(r"all_feature.csv")
    assert np.isclose(truth, h.df).all()
Example #17
    def enrich_big_table(self,
                         attributes: list = None,
                         fdr: float = 0.05,
                         reps=10000,
                         biotype: str = 'protein_coding',
                         big_table_pth: str = __bigtable_path__,
                         save_csv: bool = True,
                         fname=None):
        """
        Calculates enrichment scores, p-values and q-values \
        for enrichment and depletion of selected attributes from the Big Table. \
        P-values are calculated using a randomization test, and corrected for multiple comparisons using \
        the Benjamini–Hochberg step-up procedure (original FDR method). \
        Enrichment/depletion is determined automatically by the calculated enrichment score: \
        if log2(enrichment score) is positive then enrichment is assumed, \
        and if log2(enrichment score) is negative then depletion is assumed.

        :param attributes: An iterable of attribute names (strings). If None, a manual input prompt will be raised.
        :param fdr: float. Indicates the FDR threshold for significance.
        :param reps: How many repetitions to run the randomization for. \
        10,000 is the default. Recommended 10,000 or higher.
        :param biotype: the biotype you want your reference to have. 'all' will include all biotypes, \
        'protein_coding' will include only protein-coding genes in the reference, etc.
        :param big_table_pth: the path of the Big Table file to be used as reference.
        :param save_csv: bool. If True, will save the results to a .csv file, under the name specified in 'fname'.
        :param fname: str/Path. The full path and name of the file to which to save the results. For example: \
        r'C:\dir\file'. No '.csv' suffix is required. If None (default), fname will be requested in a manual prompt.
        :return:
        a pandas DataFrame with the indicated attribute names as rows/index, and the columns 'samples', 'n obs', \
        'n exp', 'log2_enrichment_score', 'pvals', 'padj' and 'significant'.

        .. figure::  bigtable_en.png
           :align:   center
           :scale: 40 %

           Example plot of big table enrichment
        """
        if attributes is None:
            attributes = self._from_string(
                "Please insert attributes separated by newline "
                "(for example: \n'epigenetic_related_genes\nnrde-3 targets\nALG-3/4 class small RNAs')"
            )
        elif isinstance(attributes, str):
            attributes = [attributes]
        else:
            assert isinstance(attributes, (list, tuple, set)), \
                "'attributes' must be a list, tuple or set!"

        try:
            big_table = general.load_csv(big_table_pth, 0)
        except Exception:
            raise ValueError("Invalid or nonexistent big table path!")

        assert isinstance(biotype, str)
        if biotype != 'all':
            big_table = big_table[big_table['bioType'] == biotype]

        def fraction(mysrs):
            # fraction of genes in the Series that carry a (non-NaN) annotation
            return (mysrs.shape[0] - mysrs.isna().sum()) / mysrs.shape[0]
        enriched_list = []
        for k, attribute in enumerate(attributes):
            assert isinstance(attribute, str), \
                f"Error in attribute {attribute}: attributes must be strings!"
            print(f"Finished {k} attributes out of {len(attributes)}")

            srs = big_table[attribute]
            obs_srs = srs.loc[self.gene_set]
            expected_fraction = fraction(srs)
            observed_fraction = fraction(obs_srs)
            # A small pseudocount keeps the ratio (and its log) defined when either fraction is 0.
            log2_enrichment_score = np.log2(
                (observed_fraction + 0.0001) / (expected_fraction + 0.0001))
            # Randomization test: draw random gene sets of the same size and count how often
            # their annotated fraction is at least as extreme as the observed one,
            # in the direction implied by the sign of the enrichment score.
            success = 0
            for _ in range(reps):
                rand_fraction = fraction(srs.loc[np.random.choice(
                    srs.index, obs_srs.shape[0], replace=False)])
                if log2_enrichment_score >= 0:
                    success += rand_fraction >= observed_fraction
                else:
                    success += rand_fraction <= observed_fraction
            pval = (success + 1) / (reps + 1)
            n = obs_srs.shape[0]
            enriched_list.append(
                (attribute, n, int(n * observed_fraction),
                 n * expected_fraction, log2_enrichment_score, pval))

        enriched_df = pd.DataFrame(enriched_list,
                                   columns=[
                                       'name', 'samples', 'n obs', 'n exp',
                                       'log2_enrichment_score', 'pvals'
                                   ])
        significant, padj = multitest.fdrcorrection(
            enriched_df['pvals'].values, alpha=fdr)
        enriched_df['padj'] = padj
        enriched_df['significant'] = significant
        enriched_df.set_index('name', inplace=True)

        self._plot_enrich_big_table(enriched_df)

        if save_csv:
            self._enrichment_save_csv(enriched_df, fname)
        print(enriched_df)
        return enriched_df
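
For readers who want the statistics of Example #17 in isolation, the following is a minimal, self-contained sketch of the same procedure: a randomization test for a single attribute, followed by Benjamini-Hochberg correction across a vector of p-values. It is an illustration under stated assumptions, not the library's API; the helper names (annotated_fraction, randomization_pval), the assumption that gene_set is a subset of the Series index, and the toy p-values at the end are hypothetical.

import numpy as np
import pandas as pd
from statsmodels.stats import multitest


def annotated_fraction(srs: pd.Series) -> float:
    # fraction of genes in the Series that carry a (non-NaN) annotation
    return (srs.shape[0] - srs.isna().sum()) / srs.shape[0]


def randomization_pval(attribute_srs: pd.Series, gene_set: set, reps: int = 10000):
    # log2 enrichment score and randomization-test p-value for one attribute
    # (assumes every member of gene_set appears in attribute_srs.index)
    obs_srs = attribute_srs.loc[list(gene_set)]
    observed = annotated_fraction(obs_srs)
    expected = annotated_fraction(attribute_srs)
    # pseudocount keeps the ratio defined when either fraction is 0
    log2_score = np.log2((observed + 0.0001) / (expected + 0.0001))
    success = 0
    for _ in range(reps):
        # sample a random gene set of the same size, without replacement
        rand = annotated_fraction(attribute_srs.sample(n=obs_srs.shape[0]))
        # count draws at least as extreme as the observation, in the observed direction
        success += rand >= observed if log2_score >= 0 else rand <= observed
    return log2_score, (success + 1) / (reps + 1)


# Benjamini-Hochberg step-up correction over a vector of such p-values (toy values):
pvals = np.array([0.0004, 0.03, 0.2])
significant, padj = multitest.fdrcorrection(pvals, alpha=0.05)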