def typeI_table(n1, n2, ncases, path=None):
    """Return a table of the m-test statistics under the null hypothesis.

    The function returns a table containing the value of the m-statistics
    of `ncases` draws from two populations of size `n1` and `n2` under the
    null hypothesis that the mean of the two populations is the same.

    If a stored table for population sizes `n1` and `n2` already holds at
    least `ncases` entries, all the stored values are returned unchanged.
    Otherwise only the missing cases are computed, appended to the stored
    values, written back to the table file, and the extended table is
    returned.

    Parameters
    ----------
    n1 : number of samples in population 1
    n2 : number of samples in population 2
    ncases : number of populations to generate
    path : path to the m-test tables (see `get_tables_path`)

    Returns
    -------
    test_values : 1D array of m-test statistics, containing *at least*
                  `ncases` elements, but possibly more
    """
    fname = os.path.join(get_tables_path(path), TABLESNAME % (n1, n2))

    if os.path.exists(fname):
        logging.debug('Loading type I table %s', fname)
        npzfile = sp.load(fname)
        try:
            # flatten() copies the data, so the archive can be released
            # immediately; it must not stay open while the same file is
            # rewritten further down.
            test_values = npzfile['test_values'].flatten()
        finally:
            npzfile.close()
    else:
        test_values = sp.array([])

    nvalues = test_values.shape[0]
    if nvalues >= ncases:
        return test_values

    # From here on at least one case is missing.
    nmissing = ncases - nvalues
    logging.debug('Requested %d cases, found %d, missing %d',
                  ncases, nvalues, nmissing)
    print('The requested mtest table is incomplete.')
    print('Need to process %d additional cases, this may take some time.'
          % nmissing)

    # compute missing entries, one row of each random population per case
    missing_values = sp.zeros((nmissing,))
    pop1_test, pop2_test = _random_same_mean(n1, n2, nmissing)
    for i in progressinfo(range(nmissing), style='timer'):
        missing_values[i] = mtest_marginal_likelihood_ratio(
            pop1_test[i, :], pop2_test[i, :], nprior=_NPRIOR)

    # update and save table
    test_values = sp.concatenate((test_values, missing_values))
    logging.debug('Saving updated table %s', fname)
    sp.savez(fname, test_values=test_values)
    return test_values
def typeII_table(n1, n2, ncases, mean, std, path=None):
    """Return a table of the m-test statistics under a specific hypothesis.

    The function returns a table containing the value of the m-statistics
    and (for comparison) the result of an independent t-test for `ncases`
    draws from two populations of size `n1` and `n2`, the first with
    distribution Normal(mean, std^2), and the second with distribution
    Normal(0, 1). The table is used to compute the power of the test under
    different conditions.

    If a stored table for these parameters already holds at least `ncases`
    entries, all the stored values are returned unchanged. Otherwise only
    the missing cases are computed, appended to the stored values, written
    back to the table file, and the extended tables are returned.

    Parameters
    ----------
    n1 : number of samples in population 1
    n2 : number of samples in population 2
    ncases : number of populations to generate
    mean : mean of population 1
    std : standard deviation of population 1
    path : path to the m-test tables (see `get_tables_path`)

    Returns
    -------
    m_test_values : 1D array of m-test statistics, containing *at least*
                    `ncases` elements, but possibly more
    t_test_values : 1D array with one entry per case holding element [1]
                    of the `stats.ttest_ind` result (in scipy.stats this is
                    the p-value of the independent t-test, not the
                    t-statistic), containing *at least* `ncases` elements,
                    but possibly more
    """
    fname = os.path.join(get_tables_path(path),
                         TYPEII_TABLESNAME % (n1, n2, mean, std))

    if os.path.exists(fname):
        logging.debug('Loading type II table %s', fname)
        npzfile = sp.load(fname)
        try:
            # flatten() copies the data, so the archive can be released
            # immediately; it must not stay open while the same file is
            # rewritten further down.
            m_test_values = npzfile['m_test_values'].flatten()
            t_test_values = npzfile['t_test_values'].flatten()
        finally:
            npzfile.close()
    else:
        m_test_values = sp.array([])
        t_test_values = sp.array([])

    # The number of stored cases is taken from the m-test column.
    nvalues = m_test_values.shape[0]
    if nvalues >= ncases:
        return m_test_values, t_test_values

    # From here on at least one case is missing.
    nmissing = ncases - nvalues
    logging.debug('Requested %d cases, found %d, missing %d',
                  ncases, nvalues, nmissing)
    print('The requested mtest table is incomplete.')
    print('Need to process %d additional cases, this may take some time.'
          % nmissing)

    # compute missing entries, one row of each random population per case
    pop1_test, pop2_test = _random_different_mean(n1, n2, nmissing,
                                                  mean, std)
    m_missing_values = sp.zeros((nmissing,))
    t_missing_values = sp.zeros((nmissing,))
    for i in progressinfo(range(nmissing), style='timer'):
        m_missing_values[i] = mtest_marginal_likelihood_ratio(
            pop1_test[i, :], pop2_test[i, :], nprior=_NPRIOR)
        t_missing_values[i] = stats.ttest_ind(pop1_test[i, :],
                                              pop2_test[i, :])[1]

    # update and save table
    m_test_values = sp.concatenate((m_test_values, m_missing_values))
    t_test_values = sp.concatenate((t_test_values, t_missing_values))
    logging.debug('Saving updated table %s', fname)
    sp.savez(fname, m_test_values=m_test_values,
             t_test_values=t_test_values)
    return m_test_values, t_test_values
def dofile(filepath):
    """Load the '10' and '20' records of one input file into the database.

    The file is wrapped in a `FileInfo`. If `sanity_check()` fails and the
    module-level `skip_sanity` flag is not set, the file is skipped. Unless
    `skip_fileinfo` is set, the file is registered via `addfileinfo()`; an
    IntegrityError there is reported as "already processed" and the file
    is skipped. With `skip_fileinfo` set, a file id of 0 is used.

    Every line is stripped and dispatched on its first two characters:

    * '10' -- converted with `line2sqldict` using `poltranscols` and
      buffered for `write2table`
    * '20' -- converted with `line2sqldict` using `comtranscols` and
      buffered for `write2table`
    * '00', '99' -- accepted but not stored (presumably the header and
      trailer records -- confirm against the file format)
    * anything else -- reported and the process exits with status 1; rows
      still buffered at that point are not written

    Buffered rows are written in chunks of ROWCHUNKS rows, and whatever is
    left in either buffer is written once the whole file has been read, so
    no row depends on the record counts announced by the file itself.
    Finally the record counts are verified with `final_check()` (exit
    status 1 on failure) and `set_checksum()` is called.

    Parameters
    ----------
    filepath : path of the input file, passed to `FileInfo`
    """
    ROWCHUNKS = 10000  # maximum number of rows handed to one write2table call

    polc = 0
    comc = 0
    inserts = 0

    fi = FileInfo(filepath)
    print(fi.FileName)

    if not fi.sanity_check() and not skip_sanity:
        print('Sanity Check Failed! %s %s' % (fi.SanityFail, fi.CycleNumber))
        print('Skipping this file')
        return

    if not skip_fileinfo:
        try:
            file_id = fi.addfileinfo()
        except sa.exceptions.IntegrityError:
            print('Already processed this file')
            return
    else:
        file_id = 0

    lines = fi.in_lines
    if use_mdp:
        lines = progress_bar.progressinfo(lines)

    start = time.time()
    polsql = []
    comsql = []

    for line in lines:
        line = line.strip()
        rectype = line[0:2]
        if rectype == '10':
            polc += 1
            polsql.append(line2sqldict(line, poltranscols, file_id))
            if len(polsql) >= ROWCHUNKS:
                inserts += write2table(polsql, poltranscols)
                polsql = []
        elif rectype == '20':
            comc += 1
            comsql.append(line2sqldict(line, comtranscols, file_id))
            if len(comsql) >= ROWCHUNKS:
                inserts += write2table(comsql, comtranscols)
                comsql = []
        elif rectype in ('00', '99'):
            # Recognised record types that carry nothing to insert.
            pass
        else:
            print('Unidentified line!')
            sys.exit(1)

    # Write the partially filled last chunk of each record type.
    if polsql:
        inserts += write2table(polsql, poltranscols)
    if comsql:
        inserts += write2table(comsql, comtranscols)

    ttime = time.time() - start
    print('%s: comrecs: %d, polrec: %d, total: %d, inserts: %d (time=%d)'
          % (fi.FileName, comc, polc, comc + polc, inserts, ttime))

    if not fi.final_check(polc, comc):
        print('Failed Final Check! %s' % (fi.SanityFail,))
        sys.exit(1)
    fi.set_checksum()