def randomisation_test4bigrmas_inSequences(seqOfSeqs, df_obsStat, Nsh,
                                           condsLi, sampsLi,
                                           testStat=teStat_proportions_diff):
    """DEPRECATED: use randtest4bigrmas_inSequences

    Right-tailed randomisation test on a bigram statistic.

    All sequences are pooled into one "super sequence", which is shuffled
    in place `Nsh` times. After each shuffle the pooled vector is cut back
    into sequences, the bigrams are counted, the counts are turned into a
    matrix via `ngr.condFreqDict2condProbMatrix` and `testStat` is
    evaluated on that matrix. For every cell, the number of shuffles whose
    statistic is >= the observed one is accumulated.

    Parameters
    ----------
    seqOfSeqs : collection of sequences to pool and shuffle
    df_obsStat : 2-D array-like holding the observed statistic; its shape
        (rows, columns) fixes the shape of each shuffled statistic
    Nsh : number of shuffles
    condsLi : labels handed to `ngr.condFreqDict2condProbMatrix` as the
        conditions (presumably the matrix rows -- confirm against ngr)
    sampsLi : labels handed to `ngr.condFreqDict2condProbMatrix` as the
        samples (presumably the matrix columns -- confirm against ngr)
    testStat : callable applied to the shuffled matrix; its result must be
        comparable element-wise with `df_obsStat`

    Returns
    -------
    p_r : per-cell fraction of shuffles with statistic >= observed
    shuffle_tests : array of shape (Nsh, rows, columns) with the statistic
        obtained for every shuffle

    Notes
    -----
    Shuffling uses the global NumPy random state (`np.random.shuffle`).
    The counter array is created with `np.zeros_like(df_obsStat)`, so it
    takes its dtype from `df_obsStat`.
    """
    # boundaries needed to cut the pooled vector back into sequences
    slice_bounds = superSequenceSlicer(seqOfSeqs)
    # every element of every sequence, pooled into a single vector
    pooled = np.array(flattenList(seqOfSeqs))

    n_rows, n_cols = np.shape(df_obsStat)
    shuffle_tests = np.zeros((Nsh, n_rows, n_cols))
    # per-cell count of shuffles reaching at least the observed value
    n_at_least_obs = np.zeros_like(df_obsStat)

    for k in range(Nsh):
        # in-place permutation of the pooled vector
        np.random.shuffle(pooled)

        # rebuild the sequences and convert them to the form nltk expects
        resliced = sliceBackSuperSequence(pooled, slice_bounds)
        seqs_as_str = aa.seqsLi2iniEndSeq(resliced)

        # bigrams -> conditional frequency dictionary
        bigram_list = list(nltk.bigrams(seqs_as_str))
        raw_cfd = ngr.bigrams2cfd(bigram_list)

        # make sure every label is present as a key, even if unobserved
        all_labels = set(sampsLi) | set(condsLi)
        full_cfd = ngr.fill2KyDict(raw_cfd, kySet=all_labels)

        # first returned item is the normalised matrix
        matrices = ngr.condFreqDict2condProbMatrix(full_cfd, condsLi, sampsLi)
        prob_matrix = matrices[0]

        stat_k = testStat(prob_matrix)
        shuffle_tests[k] = stat_k  # keep the full null distribution

        # right-tailed comparison against the observed statistic
        reached = stat_k >= df_obsStat
        n_at_least_obs[reached] += 1

    # p-value: share of shuffles at least as extreme as the observation
    p_r = 1.0 * n_at_least_obs / Nsh
    return p_r, shuffle_tests
def cfd(self):
    """Return the result of `ngr.bigrams2cfd` applied to `self.bigrams`."""
    bigram_seq = self.bigrams
    return ngr.bigrams2cfd(bigram_seq)