def HC_sim(c1, c2, gamma=0.15, randomize=False, pval_thresh=1.1, HCtype='HCstar'):
    """
    Higher-Criticism (HC) similarity of two discrete samples.

    Args:
    -----
    c1, c2 : two lists of integers of equal length
    gamma : HC parameter, forwarded to the HC statistic
    randomize : forwarded to ``two_sample_pvals`` (randomized P-values)
    pval_thresh : only use P-values strictly below this value. Has no
                  effect if pval_thresh > 1.
    HCtype : which statistic to compute: 'HCstar' or 'original'

    Returns:
    -------
    The selected HC statistic of the binomial allocation P-values of the
    two lists, or ``np.nan`` when no P-value is below ``pval_thresh``.

    Raises:
    ------
    ValueError if ``HCtype`` is not one of the supported values. The check
    only happens once at least one P-value survived the threshold.
    """
    pvals = two_sample_pvals(c1, c2, randomize=randomize)
    pvals_red = pvals[pvals < pval_thresh]

    if len(pvals_red) == 0:
        return np.nan

    if HCtype == 'HCstar':
        hc, _ = HC(pvals_red).HCstar(gamma=gamma)
    elif HCtype == 'original':
        hc, _ = HC(pvals_red).HC(gamma=gamma)
    else:
        # The original followed this raise with an unreachable exit(1).
        raise ValueError(f"{HCtype} is not a valid value for HCtype")

    return hc
def evaluate_iteration(n = 10, N = 10, ep = .1, mu = 1, xi = 0, metric = 'Hellinger'):
    """
    Run one simulated two-sample experiment and summarize its P-values.

    Two samples are drawn with ``sample_from_mixture(n*P, n*QP, ep)``, where
    ``P = power_law(N, xi)`` and ``QP`` is a perturbation of ``P`` chosen by
    ``metric``. P-values are computed twice with ``two_sample_pvals`` (once
    randomized and symmetric, once non-randomized), restricted to the cells
    where at least one of the two samples has a zero count.

    Args:
    -----
    n, N, ep, mu, xi : simulation parameters (see above for how each is used)
    metric : one of 'Hellinger', 'ChiSq', 'proportional', 'power'

    Returns:
    -------
    dict with keys 'HC_NR', 'minPv_NR', 'HC', 'minPv'. The HC entries are
    the HC statistic of the retained P-values below 1; the minPv entries
    are minus the log of the smallest retained P-value. All are ``np.nan``
    when no cell is retained.

    Raises:
    ------
    ValueError if ``metric`` is not recognized.
    """
    logging.debug(f"Evaluating with: n={n}, N={N}, ep={ep}, mu={mu}, xi={xi}")

    P = power_law(N, xi)

    # NOTE(review): `r` is not a parameter of this function. The
    # 'proportional' and 'power' metrics only work if a module-level `r`
    # exists; otherwise they raise NameError. Confirm against the caller.
    if metric == 'Hellinger':
        QP = (np.sqrt(P) + np.sqrt(mu)) ** 2
    elif metric == 'ChiSq':
        QP = P + 2 * np.sqrt(P * mu)
    elif metric == 'proportional':
        QP = P * (1 + r * np.log(N))
    elif metric == 'power':
        QP = P * (np.log(N) ** r)
    else:
        # Previously an unknown metric left QP unbound and failed later
        # with an obscure UnboundLocalError.
        raise ValueError(f"{metric} is not a valid value for metric")

    smp1 = sample_from_mixture(n * P, n * QP, ep)
    smp2 = sample_from_mixture(n * P, n * QP, ep)

    stbl = False
    gamma = 0.25

    # Only cells where at least one sample has a zero count are evaluated.
    zero_cells = (smp1 == 0) | (smp2 == 0)

    def _hc_and_min_pv(pvals):
        # Return (HC statistic, -log of the minimal P-value) over the kept cells.
        pvals = pvals[zero_cells]
        if len(pvals) > 0:
            hc_val, _ = HC(pvals[pvals < 1], stbl=stbl).HC(gamma=gamma)
            return hc_val, -np.log(pvals.min())
        print("empty")
        return np.nan, np.nan

    hc, MinPv = _hc_and_min_pv(
        two_sample_pvals(smp1, smp2, randomize=True, sym=True))
    hc_NR, MinPvNR = _hc_and_min_pv(
        two_sample_pvals(smp1, smp2, randomize=False))

    return {'HC_NR': hc_NR, 'minPv_NR': MinPvNR, 'HC': hc, 'minPv': MinPv}
def test_doc(self, doc, of_cls=None, **kwrgs):
    """
    Compare a new document with every class of the model by combining
    binomial allocation P-values.

    Params:
    :doc:     dataframe representing terms in the tested doc
    :of_cls:  name of the class that already contains the tested document,
              if any; that class is evaluated with the document's counts
              removed
    :stbl:    (keyword) type of HC statistic to use; defaults to self.stbl
    :gamma:   (keyword) parameter of HC statistic; defaults to self.gamma

    Returns the model's counts table joined with the document counts and,
    for each class, the columns ``<cls>:pval``, ``<cls>:score``,
    ``<cls>:Fisher``, ``<cls>:HC``, ``<cls>:chisq`` and ``<cls>:affinity``.
    """
    stbl = kwrgs.get('stbl', self.stbl)
    gamma = kwrgs.get('gamma', self.gamma)

    doc_counts = self.count_words(doc)
    logging.debug(f"Doc contains {doc_counts.n.sum()} terms.")

    table = self.counts_df
    assert (len(table) == len(doc_counts)), "count_words must use the same vocabulary"

    doc_counts['tested:T'] = doc_counts.n.sum()
    doc_counts = doc_counts.rename(columns={'n': 'tested:n'})
    table = table.join(doc_counts, how='left')

    for cls in self.cls_names:
        tested = table['tested:n'].astype(int)
        reference = table[f'{cls}:n'].astype(int)

        if of_cls == cls:
            # The document is already part of this class: subtract its
            # counts so the comparison is leave-out.
            logging.debug(
                f"Doc is of {of_cls}. Evaluating in a Leave-out manner.")
            reference -= tested
            assert (np.all(reference >= 0))

        if tested.sum() + reference.sum() > 0:
            pv, p = two_sample_pvals(tested, reference, ret_p=True)
        else:
            # No counts at all: propagate NaN for every term.
            pv = tested * np.nan
            p = tested * np.nan

        table[f'{cls}:pval'] = pv
        table[f'{cls}:score'] = -2 * np.log(table[f'{cls}:pval'])
        table[f'{cls}:Fisher'] = table[f'{cls}:score'].mean()

        hc_value, pth = HC(pv, stbl=stbl).HCstar(gamma=gamma)
        table[f'{cls}:HC'] = hc_value

        table[f'{cls}:chisq'] = two_sample_chi_square(tested, reference)[0]

        # Signed indicator, kept only for terms below the HC threshold.
        direction = -np.sign(tested - (tested + reference) * p)
        selected = pv < pth
        table[f'{cls}:affinity'] = direction * selected

    return table
def two_sample_pvals_loc(c1, c2, randomize=False, min_cnt=0, pval_type='cell', max_m=-1):
    """
    Compute P-values for two count vectors in one of three modes.

    pval_type == 'stripe' : values of ``binom_var_test(c1, c2, max_m=max_m)``
    pval_type == 'cell'   : ``two_sample_pvals`` restricted to entries with
                            ``c1 + c2 >= min_cnt``
    anything else         : stripe P-values followed by cell P-values,
                            concatenated into one array

    NOTE(review): in the combined mode ``max_m`` is not forwarded to
    ``binom_var_test``; confirm whether that is intended.
    """
    def _cell_pvals():
        exact = two_sample_pvals(c1, c2, randomize=randomize)
        keep = c1 + c2 >= min_cnt
        return exact[keep]

    if pval_type == 'stripe':
        logging.debug('Computing stripe P-values.')
        stripe = binom_var_test(c1, c2, max_m=max_m)
        return stripe.values
    elif pval_type == 'cell':
        logging.debug('Computing cell P-values.')
        return _cell_pvals()

    logging.debug('Computing cell and stripe P-values.')
    # Stripe test first, then the cell test, as in the original call order.
    stripe_vals = binom_var_test(c1, c2).values
    cell_vals = _cell_pvals()
    return np.concatenate([stripe_vals, cell_vals])
def get_pvals(self):
    """
    Return a copy of the counts table with a per-row 'pval' column.

    With exactly two classes the P-values come from ``two_sample_pvals``
    applied to the two ``<cls>:n`` columns. With more than two classes each
    row is tested with ``multinomial_test`` on its ``:n`` and ``:T``
    entries (helper columns 'x' and 'p' are added to the result).
    With fewer than two classes an error is logged and ``np.nan`` returned.
    """
    if self.num_of_cls < 2:
        logging.error("Not enough columns.")
        return np.nan

    table = self.counts_df.copy()

    if self.num_of_cls <= 2:
        logging.info("Using binomial tests.")
        first, second = self.cls_names[0], self.cls_names[1]
        pvals = two_sample_pvals(table[f"{first}:n"], table[f"{second}:n"])
    else:
        logging.info("Using multinomial tests. May be slow.")
        count_cols = table.filter(regex=r":n$")
        table['x'] = count_cols.to_records(index=False).tolist()
        total_cols = table.filter(regex=r":T$")
        table['p'] = total_cols.to_records(index=False).tolist()
        pvals = table.apply(
            lambda row: multinomial_test(row['x'], row['p']), axis=1)

    table['pval'] = pvals
    return table
def test_cls(self, cls_name, gamma=.2, stbl=True):
    """
    HC test of one class against the rest.

    Params:
    :cls_name: name of the class to test; its columns in ``self.counts_df``
               are ``n (<cls_name>)`` and ``T (<cls_name>)``
    :gamma:    parameter of the HC statistic
    :stbl:     type of HC statistic to use

    Returns a dataframe holding the class columns, the complementary
    'n (rest)' / 'T (rest)' columns, and:
      'pval'   : P-value of class counts vs. rest counts
      'HC'     : the HCstar value (same for every row)
      'thresh' : True where the P-value is below the HC threshold
      'more'   : sign of (class frequency - rest frequency)
    """
    col_name_n = f"n ({cls_name})"
    col_name_T = f"T ({cls_name})"

    # The original first bound df1 to an empty pd.DataFrame() that was
    # immediately overwritten; that dead store is dropped.
    df1 = self.counts_df.filter([col_name_n, col_name_T])

    # "Rest" = totals over all classes minus the tested class.
    df1['n (rest)'] = self.counts_df['n'] - df1[col_name_n]
    df1['T (rest)'] = self.counts_df["T"] - df1[col_name_T]

    df1['pval'] = two_sample_pvals(df1[col_name_n], df1["n (rest)"])
    hc, thr = HC(df1['pval'], stbl=stbl).HCstar(gamma=gamma)
    df1['HC'] = hc
    df1['thresh'] = df1['pval'] < thr
    df1['more'] = np.sign(
        df1[col_name_n] / df1[col_name_T]
        - df1['n (rest)'] / df1['T (rest)']
    )
    return df1
def test_doc(self, doc, of_cls=None, stbl=True, gamma=.2):
    """
    Test a new document against existing documents by combining
    binomial allocation P-values from each document.

    Params:
    :doc:     document passed to ``self.count_words``
    :of_cls:  name of the class that already contains the tested document,
              if any; that class is evaluated with the document's counts
              removed
    :stbl:    type of HC statistic to use
    :gamma:   parameter of HC statistic

    Returns the model's counts table joined with the document counts and,
    for each class, the columns ``pval (<cls>)``, ``score (<cls>)``,
    ``HC (<cls>)`` and ``affinity (<cls>)``.
    """
    dfi = self.count_words(doc)
    logging.debug(f"Doc contains {dfi.n.sum()} terms.")

    df = self.counts_df
    dfi['T (test)'] = dfi.n.sum()
    dfi = dfi.rename(columns={'n': 'n (test)'})
    df = df.join(dfi, how='left')

    for cls in self.cls_names:
        cnt1 = df['n (test)'].astype(int)
        cnt2 = df[f'n ({cls})'].astype(int)
        if of_cls == cls:
            # If the tested document is already represented in the corpus,
            # remove its counts to get a meaningful comparison.
            # The original also echoed this message with print(); the
            # logger call is kept as the single reporting channel.
            logging.debug(
                f"Doc is of {of_cls}. Evaluating in Leave-our manner.")
            cnt2 -= cnt1

        pv, p = two_sample_pvals(cnt1, cnt2, ret_p=True)
        df[f'pval ({cls})'] = pv
        df[f'score ({cls})'] = -2 * np.log(df[f'pval ({cls})'])
        df[f'HC ({cls})'], pth = HC(pv, stbl=stbl).HCstar(gamma=gamma)

        # Signed indicator, kept only for terms below the HC threshold.
        more = -np.sign(cnt1 - (cnt1 + cnt2) * p)
        thresh = pv < pth
        df[f'affinity ({cls})'] = more * thresh

    return df
def test_cls(self, cls_name, **kwrgs):
    """
    HC test of one class against the rest.

    Params:
    :cls_name: name of the class to test; its columns in ``self.counts_df``
               are ``<cls_name>:n`` and ``<cls_name>:T``
    :stbl:     (keyword) type of HC statistic to use; defaults to self.stbl
    :gamma:    (keyword) parameter of HC statistic; defaults to self.gamma

    Returns a dataframe holding the class columns, the complementary
    'rest:n' / 'rest:T' columns, and:
      'pval'   : P-value of class counts vs. rest counts
      'HC'     : the HCstar value (same for every row)
      'thresh' : True where the P-value is below the HC threshold
      'more'   : sign of (class frequency - rest frequency)
    """
    stbl = kwrgs.get('stbl', self.stbl)
    gamma = kwrgs.get('gamma', self.gamma)

    col_name_n = f"{cls_name}:n"
    col_name_T = f"{cls_name}:T"

    # The original first bound df1 to an empty pd.DataFrame() that was
    # immediately overwritten; that dead store is dropped.
    df1 = self.counts_df.filter([col_name_n, col_name_T])

    # "Rest" = totals over all classes minus the tested class.
    df1['rest:n'] = self.counts_df['n'] - df1[col_name_n]
    df1['rest:T'] = self.counts_df["T"] - df1[col_name_T]

    df1['pval'] = two_sample_pvals(df1[col_name_n], df1["rest:n"])
    hc, thr = HC(df1['pval'], stbl=stbl).HCstar(gamma=gamma)
    df1['HC'] = hc
    df1['thresh'] = df1['pval'] < thr
    df1['more'] = np.sign(
        df1[col_name_n] / df1[col_name_T]
        - df1['rest:n'] / df1['rest:T']
    )
    return df1
def BJ_sim(c1, c2, gamma=0.1, randomize=False, pval_thresh=1.1):
    """
    Berk-Jones (BJ) similarity of two discrete samples.

    Args:
    -----
    c1, c2 : two lists of integers of equal length
    gamma : lower fraction of P-values, forwarded to the BJ statistic
    randomize : forwarded to ``two_sample_pvals`` (randomized P-values)
    pval_thresh : only use P-values strictly below this value. Has no
                  effect if pval_thresh > 1.

    Returns:
    -------
    BJ statistic of the binomial allocation P-values of the two lists,
    or ``np.nan`` when no P-value is below ``pval_thresh``.
    """
    all_pvals = two_sample_pvals(c1, c2, randomize=randomize)
    kept = all_pvals[all_pvals < pval_thresh]

    if len(kept) > 0:
        stat, _ = HC(kept).BJ(gamma=gamma)
        return stat

    return np.nan
def evaluate_iteration(a, xi, r, be, n, nMonte=10, metric = 'Hellinger'):
    """
    Monte-Carlo evaluation of HC and minimal-P-value statistics.

    Derived quantities: ``N = int(n ** (1/a))``, ``P = power_law(N, xi)``,
    ``ep = N ** (-be)`` and ``mu = r * log(N) / n / 2``. In each of the
    ``nMonte`` repetitions two perturbed copies of ``P`` are built (each
    entry is replaced by the ``metric``-dependent alternative ``QP`` with
    probability ``ep/2``, then renormalized), one multinomial sample of
    size ``n`` is drawn from each, and P-values are computed with
    ``two_sample_pvals`` both randomized and non-randomized.

    Args:
    -----
    a, xi, r, be, n : simulation parameters (see above)
    nMonte : number of Monte-Carlo repetitions
    metric : one of 'Hellinger', 'ChiSq', 'proportional', 'power'

    Returns:
    -------
    DataFrame with one row per repetition and columns 'r', 'beta', 'a',
    'xi', 'N', 'n', 'metric', 'nMonte', 'HC_NR', 'minPv_NR', 'HC', 'minPv'.
    An empty DataFrame when ``nMonte`` is 0.

    Raises:
    ------
    ValueError if ``metric`` is not recognized.
    """
    N = int(n ** (1/a))
    P = power_law(N, xi)

    print("r = {}, beta = {}, a = {}, xi = {}, n = {}".format(r,be,a,xi,n))

    ep = N ** (-be)
    mu = r * np.log(N) / n / 2

    # The alternative distribution does not depend on the repetition, so
    # it is computed once instead of inside the loop.
    if metric == 'Hellinger':
        QP = (np.sqrt(P) + np.sqrt(mu)) ** 2
    elif metric == 'ChiSq':
        QP = P + 2 * np.sqrt(P * mu)
    elif metric == 'proportional':
        QP = P * (1 + r * np.log(N))
    elif metric == 'power':
        QP = P * (np.log(N) ** r)
    else:
        raise ValueError(f"{metric} is not a valid value for metric")

    stbl = False
    gamma = 0.25

    rows = []
    for _ in range(nMonte):
        TH1 = np.random.rand(N) < ep/2
        TH2 = np.random.rand(N) < ep/2

        Q1 = P.copy()
        Q1[TH1] = QP[TH1]
        Q1 = Q1 / Q1.sum()

        Q2 = P.copy()
        Q2[TH2] = QP[TH2]
        Q2 = Q2 / Q2.sum()

        smp_P = np.random.multinomial(n, Q1)
        smp_Q = np.random.multinomial(n, Q2)

        # The result of this draw was never used, but the call is kept so
        # the global random stream (and seeded results) stay unchanged.
        np.random.poisson(lam=n*P)

        pv = two_sample_pvals(smp_Q, smp_P, randomize=True, sym=True)
        # NOTE(review): the original evaluated
        # pv[(smp_Q == 0) | (smp_P == 0)] here and discarded the result,
        # so no zero-count filtering was ever applied. Assign it if that
        # filtering was intended.
        hc, _ = hc_vals(pv[pv < 1], gamma=gamma, stbl=stbl, minPv=0)

        pv_NR = two_sample_pvals(smp_Q, smp_P, randomize=False)
        hc_NR, _ = hc_vals(pv_NR[pv_NR < 1], gamma=gamma, stbl=stbl, minPv=0)

        MinPv = -np.log(pv.min())
        MinPvNR = -np.log(pv_NR.min())

        rows.append(pd.DataFrame({
            'r': [r], 'beta': [be], 'a': [a], 'xi': [xi],
            'N': [N], 'n': [n],
            'metric': metric,
            'nMonte': nMonte,
            'HC_NR': hc_NR,
            'minPv_NR': MinPvNR,
            'HC': hc,
            'minPv': MinPv,
        }))

    # DataFrame.append was removed in pandas 2.0; collect and concat once.
    # pd.concat rejects an empty list, so handle nMonte == 0 explicitly.
    if not rows:
        return pd.DataFrame()
    return pd.concat(rows, ignore_index=True)