Пример #1
0
def randomisation_test4bigrmas(
    df_dict,
    Dtint,
    obsStat,
    Nsh,
    condsLi,
    sampsLi,
    label="call",
    time_param="ici",
    testStat=teStat_proportions_diff,
):
    """Right-tailed randomisation test for per-bigram statistics.

    For each of ``Nsh`` shuffles, a shuffled conditional frequency
    distribution is accumulated over every dataframe in ``df_dict``,
    converted to a conditional probability matrix, and passed through
    ``testStat``. The returned p-value of each matrix entry is the fraction
    of shuffles whose statistic is greater than or equal to the observed
    one.

    Parameters
    ----------
    df_dict : dict
        dictionary of dataframes (tapes); only its values are used
    Dtint : tuple
        forwarded unchanged to ``shuffled_cfd`` (originally documented as
        ``(None, Dt)``)
    obsStat : ndarray
        observed statistic for each bigram; must be two-dimensional, since
        its shape is unpacked into rows and columns
    Nsh : int
        number of shuffles
    condsLi, sampsLi : list
        lists of conditions and samples, forwarded to
        ``ngr.condFreqDict2condProbMatrix``
    label : str
        forwarded to ``shuffled_cfd``
    time_param : str
        forwarded to ``shuffled_cfd``
    testStat : callable
        maps the shuffled probability matrix to a statistic that is
        comparable element-wise with ``obsStat``

    Returns
    -------
    p_r : ndarray
        right-tail p-values, same shape as ``obsStat``
    shuffle_tests : ndarray
        shuffled test statistics, shape ``(Nsh, nr, nc)``
    """
    n_rows, n_cols = np.shape(obsStat)
    shuffle_tests = np.zeros((Nsh, n_rows, n_cols))
    # Counter of "shuffled >= observed" hits; inherits obsStat's dtype.
    exceed_counts = np.zeros_like(obsStat)
    shuffle_kwargs = {"label": label, "time_param": time_param}

    for sh_idx in range(Nsh):
        # Pool the shuffled bigram counts of all tapes into one distribution.
        pooled_cfd = nltk.ConditionalFreqDist()
        for tape_df in df_dict.values():
            pooled_cfd += shuffled_cfd(tape_df, Dtint, **shuffle_kwargs)

        # Only the matrix is needed; the returned labels are discarded.
        prob_matrix, _samps, _conds = ngr.condFreqDict2condProbMatrix(
            pooled_cfd, condsLi, sampsLi)

        sh_stat = testStat(prob_matrix)
        shuffle_tests[sh_idx] = sh_stat  # keep the full null distribution
        exceed_counts[sh_stat >= obsStat] += 1  # right-tail count

    p_r = 1.0 * exceed_counts / Nsh
    return p_r, shuffle_tests
Пример #2
0
def randomisation_test4bigrmas_inSequences(seqOfSeqs,
                                           df_obsStat,
                                           Nsh,
                                           condsLi,
                                           sampsLi,
                                           testStat=teStat_proportions_diff):
    """DEPRECATED: use randtest4bigrmas_inSequences

    Right-tailed randomisation test for bigram statistics in sequences.

    All sequences are flattened into one "super sequence", which is
    shuffled in place with ``np.random.shuffle`` on every iteration, sliced
    back into sequences, and turned into bigram counts. The counts are
    converted to a conditional probability matrix and passed through
    ``testStat``; the p-value of each entry is the fraction of shuffles
    whose statistic is greater than or equal to the observed one.

    Parameters
    ----------
    seqOfSeqs : sequence of sequences
        the observed sequences
    df_obsStat : two-dimensional array-like
        observed statistic for each bigram
    Nsh : int
        number of shuffles
    condsLi, sampsLi : list
        lists of conditions and samples
    testStat : callable
        maps the shuffled probability matrix to a statistic that is
        comparable element-wise with ``df_obsStat``

    Returns
    -------
    p_r : ndarray
        right-tail p-values
    shuffle_tests : ndarray
        shuffled test statistics, shape ``(Nsh, nr, nc)``
    """
    # Slicer used to cut the shuffled super sequence back into sequences.
    slice_bounds = superSequenceSlicer(seqOfSeqs)
    # Flat vector holding the elements of every sequence.
    pooled_seq = np.array(flattenList(seqOfSeqs))

    n_rows, n_cols = np.shape(df_obsStat)
    shuffle_tests = np.zeros((Nsh, n_rows, n_cols))
    exceed_counts = np.zeros_like(df_obsStat)

    for sh_idx in range(Nsh):
        # In-place shuffle: each iteration permutes the previous permutation.
        np.random.shuffle(pooled_seq)

        # Rebuild the sequences and put them in the format used for nltk.
        resliced = sliceBackSuperSequence(pooled_seq, slice_bounds)
        sequences_str = aa.seqsLi2iniEndSeq(resliced)

        # Count bigrams, then add empty entries for the missing keys.
        bigram_list = list(nltk.bigrams(sequences_str))
        raw_cfd = ngr.bigrams2cfd(bigram_list)
        full_cfd = ngr.fill2KyDict(
            raw_cfd, kySet=set(sampsLi) | set(condsLi))

        # First element of the returned value is the probability matrix.
        prob_matrix = ngr.condFreqDict2condProbMatrix(
            full_cfd, condsLi, sampsLi)[0]

        sh_stat = testStat(prob_matrix)
        shuffle_tests[sh_idx] = sh_stat  # keep the full null distribution
        exceed_counts[sh_stat >= df_obsStat] += 1  # right-tail count

    p_r = 1.0 * exceed_counts / Nsh
    return p_r, shuffle_tests