예제 #1
0
def randomisation_test4bigrmas_inSequences(seqOfSeqs,
                                           df_obsStat,
                                           Nsh,
                                           condsLi,
                                           sampsLi,
                                           testStat=teStat_proportions_diff):
    """DEPRECATED: use randtest4bigrmas_inSequences

    Shuffle-based (right-sided) randomisation test on a bigram statistic.

    All elements of ``seqOfSeqs`` are pooled into one array, which is
    shuffled in place ``Nsh`` times. After each shuffle the pooled array is
    sliced back into sequences, bigrams are counted, and ``testStat`` is
    evaluated on the resulting matrix. The returned p-value is, element by
    element, the fraction of shuffles whose statistic is greater than or
    equal to the observed one.

    Parameters
    ----------
    seqOfSeqs : sequence of sequences
        Passed to ``superSequenceSlicer`` and ``flattenList``.
    df_obsStat : 2-D array-like
        Observed statistic; its shape ``(nr, nc)`` fixes the shape of the
        stored shuffle statistics and of the counter array.
    Nsh : int
        Number of shuffles.
    condsLi, sampsLi : list
        Labels handed to ``ngr.condFreqDict2condProbMatrix``; their union is
        the key set given to ``ngr.fill2KyDict``.
    testStat : callable
        Applied to the first item returned by
        ``ngr.condFreqDict2condProbMatrix``; the result is stored in one
        ``(nr, nc)`` slot and compared against ``df_obsStat``.

    Returns
    -------
    p_r : array
        ``1.0 * counts / Nsh``, where ``counts`` tallies the shuffles with
        ``testStat(...) >= df_obsStat``.
    shuffle_tests : numpy.ndarray, shape (Nsh, nr, nc)
        Statistic obtained for every shuffle.
    """
    # Slicing information needed to cut the pooled array back into sequences.
    slice_bounds = superSequenceSlicer(seqOfSeqs)

    # Pooled ("super") sequence holding every element of every sequence.
    pooled = np.array(flattenList(seqOfSeqs))

    n_rows, n_cols = np.shape(df_obsStat)
    shuffle_tests = np.zeros((Nsh, n_rows, n_cols))
    # Counter shares shape (and dtype) with the observed statistic.
    exceed_counts = np.zeros_like(df_obsStat)

    for k in range(Nsh):
        # In-place shuffle: each round permutes the previous permutation.
        np.random.shuffle(pooled)

        # Rebuild the sequences and convert them to the form fed to nltk.
        resliced = sliceBackSuperSequence(pooled, slice_bounds)
        tokens = aa.seqsLi2iniEndSeq(resliced)

        # Count bigrams, then add the keys that did not occur in this shuffle.
        bigram_pairs = list(nltk.bigrams(tokens))
        raw_cfd = ngr.bigrams2cfd(bigram_pairs)
        full_cfd = ngr.fill2KyDict(raw_cfd,
                                   kySet=set(sampsLi) | set(condsLi))

        # Only the first returned item is used (the original comment calls
        # it the normalised matrix).
        prob_matrix = ngr.condFreqDict2condProbMatrix(
            full_cfd, condsLi, sampsLi)[0]

        stat_k = testStat(prob_matrix)
        shuffle_tests[k] = stat_k  # keep the null distribution
        at_least_observed = stat_k >= df_obsStat  # right-sided comparison
        exceed_counts[at_least_observed] += 1

    # p-value: share of shuffles reaching at least the observed statistic.
    return 1.0 * exceed_counts / Nsh, shuffle_tests
예제 #2
0
 def cfd(self):
     """Return ``ngr.bigrams2cfd`` applied to this object's ``bigrams``.

     NOTE(review): presumably a conditional frequency distribution of the
     bigrams -- confirm against ``ngr.bigrams2cfd``.
     """
     bigram_items = self.bigrams
     return ngr.bigrams2cfd(bigram_items)