예제 #1
0
def permute_w_replacement(frame, axis=0):
    '''
    Resample the frame values, with replacement, along the given axis.

    Create a simulated dataset in which every value is replaced by a
    value drawn at random (with replacement) from the same row or the
    same column of the input, depending on ``axis``.
    E.g. with ``axis=1`` the counts of each component (column) in each
    sample (row) are randomly sampled from all the counts of that
    component in all samples.

    Parameters
    ----------
    frame : DataFrame
        Frame to permute.
    axis : {0, 1}
        - 0 - Permute row values across columns
        - 1 - Permute column values across rows

    Returns
    -------
    Permuted DataFrame (new instance) carrying the same index and
    columns as `frame`.
    '''
    import numpy as np
    from numpy.random import randint

    # NOTE(review): _get_axis is defined elsewhere in this module and is
    # assumed to normalise its argument to 0 or 1.  The flipped value is
    # the numpy axis along which replacement values are drawn.
    axis = 1 - _get_axis(axis)
    values = frame.values
    n_rows, n_cols = values.shape
    if values.size == 0:
        # Nothing to draw from; randint(0, 0, ...) would be ill-defined.
        return frame.copy()
    if axis == 1:
        # Each row draws n_cols positions from its own columns.
        picks = randint(0, n_cols, (n_rows, n_cols))
        permuted = values[np.arange(n_rows)[:, np.newaxis], picks]
    else:
        # Each column draws n_rows positions from its own rows.  Drawing
        # as (n_cols, n_rows) and transposing keeps the random stream in
        # column-by-column order.
        picks = randint(0, n_rows, (n_cols, n_rows)).T
        permuted = values[picks, np.arange(n_cols)]
    # Build the result explicitly instead of relying on how
    # DataFrame.apply wraps ndarray return values, which differs between
    # pandas versions (Series of arrays / lost labels).
    return type(frame)(permuted, index=frame.index, columns=frame.columns)
예제 #2
0
def alr(frame, ref=None, axis=0):
    '''
    Compute the additive log-ratio (alr) transformation
    with respect to the component given in ref.

    Parameters
    ----------
    frame : DataFrame
        Frame to be transformed. Any other input is forwarded unchanged
        to alr_for_array.
    ref : valid label | None
        Label of component to be used as the normalization reference.
        i.e. values of other components will be divided by values of
        this reference component.
        If None is passed (default), the last col/row is used as ref.
    axis : {0, 1}
        0 : transform each row (default)
        1 : transform each column

    Returns
    -------
    DataFrame of log-ratios with the reference col/row removed
    (or whatever alr_for_array returns for non-DataFrame input).
    '''
    if not isinstance(frame, DF):
        return alr_for_array(frame, ref, axis)
    # NOTE(review): _get_axis is defined elsewhere in this module and is
    # assumed to normalise its argument to 0 or 1.
    axis = _get_axis(axis)
    other_axis = 1 - axis
    if ref is None:
        # Default reference: last column when transforming rows,
        # last row when transforming columns.
        labels = frame.columns if axis == 0 else frame.index
        label = labels[-1]
    else:
        label = ref
    if axis == 0:
        norm = 1. * frame[label]
    else:
        norm = 1. * frame.xs(label)
    # Divide every component by the reference; `norm` is aligned on the
    # index for axis=0 and on the columns for axis=1.
    ratios = frame.div(norm, axis=axis)
    # `axis` must be passed by keyword: pandas 2.0 removed the positional
    # form of DataFrame.drop(labels, axis).
    return log(ratios).drop(label, axis=other_axis)
예제 #3
0
def alr_for_array(frame, ref=None, axis=0):
    '''
    Compute the additive log-ratio (alr) transformation of a 2-D array
    with respect to the component at position ref.

    Parameters
    ----------
    frame : array-like (2-D)
        Data to be transformed. Converted with np.asarray, so nested
        sequences are accepted as well as ndarrays.
    ref : int | None
        Position of the component used as the normalization reference.
        If None is passed (default), the last col/row is used as ref.
    axis : {0, 1}
        0 : transform each row, reference is a column (default)
        1 : transform each column, reference is a row

    Returns
    -------
    ndarray of log-ratios with the reference col/row removed.
    '''
    # NOTE(review): _get_axis is defined elsewhere in this module and is
    # assumed to normalise its argument to 0 or 1.
    axis = _get_axis(axis)
    # alr() forwards every non-DataFrame input here; coerce so that the
    # tuple indexing below also works for plain nested sequences.
    data = np.asarray(frame)
    label = -1 if ref is None else ref
    # Keep the reference as a column/row vector so that it broadcasts
    # against every other component.
    if axis == 0:
        norm = (1. * data[:, label])[:, np.newaxis]
    else:
        norm = (1. * data[label, :])[np.newaxis, :]
    ratios = log(data / norm)
    return np.delete(ratios, label, 1 - axis)
예제 #4
0
def correlation(frame, method='pearson', axis=0):
    '''
    Compute pairwise correlations between all rows or all columns.

    Parameters
    ----------
    frame : DataFrame
        Frame containing data.
    method : {pearson (default) | spearman | kendall}
        Type of correlation to compute (case-insensitive).
    axis : {0, 1}
        - 0 - Compute correlation between columns
        - 1 - Compute correlation between rows

    Returns
    -------
    c : frame
        DataFrame of symmetric pairwise correlation coefficients.
        Labels are the rows/column labels of the input frame.
    p : frame
        DataFrame of p-values associated with correlation values.
        Labels are the rows/column labels of the input frame.

    Raises
    ------
    ValueError
        If `method` is not one of the supported correlation types.
    '''
    import scipy.stats as stats
    axis = _get_axis(axis)
    method = method.lower()
    if method not in ('pearson', 'kendall', 'spearman'):
        raise ValueError('Correlation of method %s is not supported.' %method)

    if method == 'spearman':
        # spearmanr handles the whole matrix in a single call.
        coef, pval = stats.spearmanr(frame.values, axis=axis)
        if not np.shape(coef):
            # With only two variables scipy returns scalars; expand them
            # into 2x2 matrices with ones on the diagonal.
            coef = np.array([[1, coef], [coef, 1]])
            pval = np.array([[1, pval], [pval, 1]])
        names = frame._get_axis(1-axis)
        c = DF(coef, index=names, columns=names)
        p = DF(pval, index=names, columns=names)
        return c, p

    # pearson / kendall: scipy only offers a two-vector routine, so the
    # matrices are filled pair by pair.
    pair_fun = stats.pearsonr if method == 'pearson' else stats.kendalltau
    # Orient the data so that each row is one variable to be compared.
    if axis == 0:
        data = frame.T
    elif axis == 1:
        data = frame
    values = data.values
    names = data.index
    size = len(names)
    # Identity start: both the coefficient and the p-value of a variable
    # with itself are set to 1.
    coef = np.eye(size)
    pval = np.eye(size)
    for i in range(size):
        for j in range(i + 1, size):
            r, p_value = pair_fun(values[i, :], values[j, :])
            # Mirror each result to keep the matrices symmetric.
            coef[i, j] = coef[j, i] = r
            pval[i, j] = pval[j, i] = p_value
    c = DF(coef, index=names, columns=names)
    p = DF(pval, index=names, columns=names)
    return c, p