def chisq_test(self, shuffled=0):
    """Run a chi-square test of independence on the observed counts.

    Parameters
    ----------
    shuffled
        when falsy (the default) the p-value is obtained from
        ``chisqprob`` using ``self.df``. Otherwise the value is passed as
        ``num_reps`` to ``estimate_pval``, so the p-value is estimated via
        resampling from the observed data, preserving the marginals.

    Returns
    -------
    A ``TestResult`` labelled "chisq" that carries the observed, expected
    and residual tables, the statistic, the degrees of freedom and the
    p-value.
    """
    observed = self.observed
    expected = self.expected
    stat = calc_chisq(observed.array, expected.array)

    if shuffled:
        # resampling based estimate, ``shuffled`` is the number of replicates
        pval = estimate_pval(observed.array, calc_chisq, num_reps=shuffled)
    else:
        pval = chisqprob(stat, self.df)

    return TestResult(
        observed,
        expected,
        self.residuals,
        "chisq",
        stat,
        self.df,
        pval,
        test_name="Chisq-test for independence",
    )
def pvalue(self):
    """Return the p-value for ``self.LR`` given ``self.df``.

    Returns
    -------
    ``1`` when LR is exactly zero, ``chisqprob(LR, df)`` when LR is
    positive, and ``None`` otherwise (i.e. LR < 0, or LR does not compare
    as zero or positive).
    """
    lr = self.LR
    if lr == 0:
        return 1
    if lr > 0:
        return chisqprob(lr, self.df)
    # negative (or unordered) LR has no defined p-value here
    return None
def G_independence(self, pseudo_count=0, williams=True, shuffled=0):
    """Perform the G-test of independence.

    Parameters
    ----------
    pseudo_count : int
        added to observed to avoid zero division. It is only applied when
        at least one observed count is zero; the expected values are then
        recalculated from the adjusted observed counts.
    williams : bool
        Applies Williams correction for small sample size (forwarded to
        ``calc_G``).
    shuffled : int
        when non-zero, the p-value is estimated via resampling ``shuffled``
        times from the observed data, preserving the marginals. Otherwise
        it comes from ``chisqprob`` using ``self.df``.

    Returns
    -------
    A ``TestResult`` labelled "G" whose test name lists the amendments
    (pseudo count and/or Williams correction) that were requested.

    Raises
    ------
    AssertionError
        if ``pseudo_count`` or ``shuffled`` is not exactly of type ``int``.
    """
    # validate both arguments up front so nothing is computed on bad input
    assert type(pseudo_count) == int, f"{pseudo_count} not an integer"
    assert type(shuffled) == int, f"{shuffled} not an integer"

    obs = self.observed
    exp = self.expected
    if pseudo_count and (obs.array == 0).any():
        obs = obs.template.wrap(obs.array + pseudo_count)
        exp = obs.template.wrap(calc_expected(obs.array))

    G = calc_G(
        obs.array,
        exp.array,
        williams=williams,
    )
    if shuffled:
        pval = estimate_pval(obs.array, calc_G, num_reps=shuffled)
    else:
        pval = chisqprob(G, self.df)

    # Collect the amendments separately and join them, so that a pseudo
    # count without the Williams correction does not leave a dangling ", ".
    amendments = []
    if pseudo_count:
        amendments.append(f"pseudo_count={pseudo_count}")
    if williams:
        amendments.append("Williams correction")

    title = "G-test for independence"
    if amendments:
        title = f"{title} (with {', '.join(amendments)})"

    # NOTE(review): residuals are taken from self.residuals, i.e. they are
    # not recomputed from the pseudo-count adjusted tables -- confirm that
    # this is intended.
    return TestResult(
        obs,
        exp,
        self.residuals,
        "G",
        G,
        self.df,
        pval,
        test_name=title,
    )
def G_independence(self, pseudo_count=0, williams=True, shuffled=0):
    """Run the G-test of independence on the observed counts.

    Parameters
    ----------
    pseudo_count : int
        added to observed to avoid zero division (forwarded to ``calc_G``)
    williams : bool
        forwarded to ``calc_G``; when true the test name records that the
        Williams correction was requested
    shuffled : int
        when non-zero, the p-value is estimated via resampling ``shuffled``
        times from the observed data, preserving the marginals. Otherwise
        it comes from ``chisqprob`` using ``self.df``.

    Returns
    -------
    A ``TestResult`` labelled "G".

    Raises
    ------
    AssertionError
        if ``pseudo_count`` or ``shuffled`` is not exactly of type ``int``.
    """
    assert type(pseudo_count) == int, f"{pseudo_count} not an integer"
    assert type(shuffled) == int, f"{shuffled} not an integer"

    observed = self.observed
    expected = self.expected
    G = calc_G(
        observed.array,
        expected.array,
        pseudo_count=pseudo_count,
        williams=williams,
    )

    if shuffled:
        pval = estimate_pval(observed.array, calc_G, num_reps=shuffled)
    else:
        pval = chisqprob(G, self.df)

    title = "G-test for independence"
    if williams:
        title = f"{title} (with Williams correction)"

    return TestResult(
        observed,
        expected,
        self.residuals,
        "G",
        G,
        self.df,
        pval,
        test_name=title,
    )
def get_position_effects(table, position_sets, group_label=None):
    """Evaluate ``log_lin.position_effect`` for each set of positions.

    Parameters
    ----------
    table
        counts table handed to the count-combining helpers
    position_sets
        iterable of position sets; each one becomes a key of the result
    group_label
        optional column name. When given, ``table`` must hold exactly two
        distinct values for it and grouped combined counts are used.

    Returns
    -------
    dict mapping each position set to a dict with the keys ``rel_entropy``,
    ``deviance``, ``df``, ``stats``, ``formula`` and ``prob``.
    """
    grouped = group_label is not None
    if grouped:
        assert len(table.distinct_values(group_label)) == 2

    pos_results = {}
    for position_set in position_sets:
        if grouped:
            counts = get_grouped_combined_counts(
                table, position_set, group_label=group_label
            )
        else:
            counts = motif_count.get_combined_counts(table, position_set)

        rel_entropy, deviance, df, stats, formula = log_lin.position_effect(
            counts, group_label=group_label
        )

        # a negative deviance is reported with a probability of 1.0
        p = 1.0 if deviance < 0 else chisqprob(deviance, df)

        pos_results[position_set] = {
            "rel_entropy": rel_entropy,
            "deviance": deviance,
            "df": df,
            "stats": stats,
            "formula": formula,
            "prob": p,
        }

    return pos_results
def main(
    countsfile,
    outpath,
    countsfile2,
    strand_symmetry,
    force_overwrite,
    dry_run,
    verbose,
):
    """Run ``log_lin.spectra_difference`` for every distinct 'start' value.

    Parameters
    ----------
    countsfile
        path to a tab delimited counts table
    outpath
        directory that receives the log, json and summary files
    countsfile2
        path to the second counts table; only read when
        ``strand_symmetry`` is false
    strand_symmetry
        when true, groups are defined by the 'strand' label of
        ``countsfile``. Otherwise the two files are labelled as groups
        '1' and '2' and combined.
    force_overwrite
        not used by this function
    dry_run
        when true, nothing is written to ``outpath``
    verbose
        when true, the combined counts table is printed
    """
    # must remain the first statement so only the call arguments are captured
    args = locals()

    table = LoadTable(countsfile, sep="\t")
    if not dry_run:
        LOGGER.log_file_path = os.path.join(
            util.abspath(outpath), "spectra_analysis.log"
        )
        LOGGER.log_message(str(args), label="vars")

    # NOTE(review): recorded irrespective of dry_run, in line with the
    # handling of countsfile2 below -- confirm against the original layout.
    LOGGER.input_file(countsfile)

    if strand_symmetry:
        # with strand symmetry a second file is not needed
        group_label = "strand"
        counts_table = util.spectra_table(table, group_label)
    else:
        group_label = "group"

        # two files are required
        second = LoadTable(countsfile2, sep="\t")
        LOGGER.input_file(countsfile2)
        second = second.with_new_column(
            "group", lambda x: "2", columns=second.header[0]
        )
        first = table.with_new_column(
            "group", lambda x: "1", columns=table.header[0]
        )
        first = util.spectra_table(first, group_label)
        second = util.spectra_table(second, group_label)

        # now combine
        header = ["group"] + second.header[:-1]
        counts_table = LoadTable(
            header=header, rows=first.tolist(header) + second.tolist(header)
        )

    if verbose:
        print(counts_table)

    # spectra table has [count, start, end, group] order
    # we reduce comparisons to a start base
    results = []
    saveable = {}
    for start_base in counts_table.distinct_values("start"):
        subtable = counts_table.filtered('start == "%s"' % start_base)
        kept = [c for c in counts_table.header if c != "start"]
        subtable = subtable.get_columns(kept)
        total_re, dev, df, collated, formula = log_lin.spectra_difference(
            subtable, group_label
        )
        rows = [list(rec) for rec in collated.to_records(index=False)]

        if not strand_symmetry:
            # report the originating file name instead of the '1'/'2' label
            grp_labels = {"1": countsfile, "2": countsfile2}
            grp_index = list(collated.columns).index("group")
            for row in rows:
                row[grp_index] = grp_labels[row[grp_index]]

        p = chisqprob(dev, df)
        prob = ("%.2e" % p) if p < 1e-6 else ("%.6f" % p)

        for row in rows:
            row.insert(0, start_base)
            row.append(prob)
        results.extend(rows)

        significance = [
            "RE=%.6f" % total_re,
            "Dev=%.2f" % dev,
            "df=%d" % df,
            "p=%s" % p,
        ]
        print("Start base=%s %s" % (start_base, " : ".join(significance)))

        saveable[start_base] = {
            "rel_entropy": total_re,
            "deviance": dev,
            "df": df,
            "prob": p,
            "formula": formula,
            "stats": collated.to_json(),
        }

    # NOTE(review): ``collated`` (and ``significance`` below) hold the values
    # from the final loop iteration; they are undefined if the loop never ran.
    # The 'ret' sort column is presumably one of collated's columns -- confirm.
    summary = LoadTable(
        header=["start_base"] + list(collated.columns) + ["prob"],
        rows=results,
        digits=5,
    ).sorted(columns="ret")

    out_dir = util.abspath(outpath)
    if dry_run:
        return

    util.makedirs(out_dir)
    json_path = os.path.join(out_dir, "spectra_analysis.json")
    dump_json(saveable, json_path)
    LOGGER.output_file(json_path)

    table_path = os.path.join(out_dir, "spectra_summary.txt")
    summary.write(table_path, sep="\t")
    LOGGER.output_file(table_path)

    # only the statistics of the last start base are logged here
    LOGGER.log_message(str(significance), label="significance")