Пример #1
0
def adeSavGolFeatureCorrelation(data, max_corr, country):
    """
    Drop the ``L`` column of every prefix whose paired columns are too correlated.

    For each prefix of a fixed list, the columns of ``data`` whose name starts
    with that prefix are selected and the first two of them are compared with
    the Pearson coefficient and with the first score returned by
    ``minerva.mine``. Both scores are collected in a table that is written to
    an Excel file named after ``country`` and printed.

    :param data: (pandas DataFrame) frame holding at least two columns per prefix
    :param max_corr: (float) a score strictly above this value triggers the removal \
    of the column named ``<prefix>L``
    :param country: (string) inserted in the name of the Excel report
    :return: (pandas DataFrame) ``data`` without the removed columns
    """
    print(
        "Testing Correlation between corresponding L and C AVG statistics for each underlying datatype"
    )
    prefixes = ['I2', 'I1', 'GM2', 'GM1', 'FF1', 'MP1', 'MP4', 'MP2']
    score_names = ['Pearson', 'MIC']
    correlation_results = pd.DataFrame(columns=prefixes, index=score_names)
    dropped = []
    for prefix in prefixes:
        matching = [
            name for name in data.columns.values if name.startswith(prefix)
        ]
        pair = data[matching]
        first = pair.iloc[:, 0]
        second = pair.iloc[:, 1]
        pearson_r = sp.stats.pearsonr(first, second)[0]
        mic = minerva.mine(vc.FloatVector(np.asarray(first)),
                           vc.FloatVector(np.asarray(second)))[0][0]
        correlation_results[prefix] = pd.Series([pearson_r, float(mic)],
                                                index=score_names)
        # A pair scoring above max_corr on either measure loses its "L" column
        if pearson_r > max_corr or mic > max_corr:
            dropped.append("{}{}".format(prefix, "L"))

    kept = [name for name in list(data.columns.values) if name not in dropped]
    data = data.loc[:, kept]

    report_path = "../Reserach/AdeSavGol Transform/L&C correlation/{}.xlsx".format(
        country)
    correlation_results.to_excel(report_path, engine="openpyxl")
    print(correlation_results)
    return data
Пример #2
0
def mann_withney_test_r(list_values1, list_values2):
    """
    Run R's two-sided ``wilcox.test`` (``correct=F``) on two samples.

    :param list_values1: (list of float) first sample
    :param list_values2: (list of float) second sample
    :return: (float) the p-value reported by R
    """
    r_source = """

    function(x, y){
        test = wilcox.test(x,y, alternative='two.sided', correct=F)
        return(test$p.value)
    }

                   """
    r_wilcox = robj.r(r_source)
    r_result = r_wilcox(v.FloatVector(list_values1), v.FloatVector(list_values2))
    return float(r_result[0])
def pvalue_getter(list1, list2):
    """
    Print both samples, then compare them with R's ``wilcox.test``.

    :param list1: (list of float) first sample
    :param list2: (list of float) second sample
    :return: (float) the p-value reported by R
    """
    for sample in (list1, list2):
        print(sample)
    r_source = """
        function(list1, list2){
            return(wilcox.test(list1,list2)$p.value)
        }
        """
    r_wilcox = robj.r(r_source)
    r_result = r_wilcox(v.FloatVector(list1), v.FloatVector(list2))
    return float(r_result[0])
def frequency_test(obs1, tot1, obs2, tot2):
    """
    Chi-squared test (R ``chisq.test``) on the 2x2 table \
    ``[[obs1, tot1 - obs1], [obs2, tot2 - obs2]]``.

    The four cell counts and the p-value are printed before returning.

    :param obs1: (int) the count number of an amino acid X in the set of protein 1.
    :param tot1: (int) the total number of amino acids in the set of protein 1.
    :param obs2: (int) the count number of an amino acid X in the set of protein 2.
    :param tot2: (int) the total number of amino acids in the set of protein 2.
    :return: (float) the p-value, or the string "NA" when the p-value is NaN
    """
    r_chisq = robj.r("""

        function(vect){
            m<-matrix(vect, byrow=T, nrow=2)
            return(chisq.test(m)$p.value)
        }

                       """)

    # Counts of everything that is NOT the observed amino acid in each set
    rest1 = tot1 - obs1
    rest2 = tot2 - obs2
    cells = v.FloatVector([obs1, rest1, obs2, rest2])
    pval = float(r_chisq(cells)[0])
    print(obs1, rest1, obs2, rest2)
    print(pval)
    return "NA" if np.isnan(pval) else pval
Пример #5
0
def create_statistical_report(list_values, list_name, ctrl_full, filename, nt):
    """
    Write a tab-separated table of test p-values against a control.

    Every sub-list of ``list_values`` is compared, NaN removed, to the control
    values with ``statistical_analysis.mann_withney_test_r``; the p-values are
    then adjusted with R's ``p.adjust(method="BH")`` and the table is saved
    next to the figure (``.html`` replaced by ``wilcox_stat.txt``).

    :param list_values: (list of list of floats) the values compared to the control
    :param list_name: (list of string) the name of each sublist of ``list_values``
    :param ctrl_full: (list of float, or a mapping indexed by ``nt``) the control values
    :param filename: (string) the name of the figure associated with those stat
    :param nt: (string) the nucleotide studied; when falsy ``ctrl_full`` is used as is
    """
    raw_ctrl = ctrl_full[nt] if nt else ctrl_full
    ctrl_arr = np.array(raw_ctrl, dtype=float)
    ctrl_clean = list(ctrl_arr[~np.isnan(ctrl_arr)])

    factors = []
    pvalues = []
    dic_res = {"Factor": factors, "P-value": pvalues}
    for idx, values in enumerate(list_values):
        factors.append(list_name[idx])
        arr = np.array(values, dtype=float)
        sample = list(arr[~np.isnan(arr)])
        pvalues.append(
            statistical_analysis.mann_withney_test_r(sample, ctrl_clean))

    report = pd.DataFrame(dic_res)
    stats_pkg = robj.packages.importr('stats')
    report["P-adjusted_BH"] = stats_pkg.p_adjust(v.FloatVector(pvalues),
                                                 method="BH")
    out_path = filename.replace(".html", "wilcox_stat.txt")
    report.to_csv(out_path, sep="\t", index=False)
def adjust_pvalues(pvalues):
    """
    Adjust p-values with R's ``p.adjust`` using ``method="BH"``.

    :param pvalues: (list of float) the raw p-values
    :return: (list of float) the adjusted p-values, in the same order
    """
    stats_pkg = robj.packages.importr('stats')
    r_pvalues = v.FloatVector(pvalues)
    adjusted = stats_pkg.p_adjust(r_pvalues, method="BH")
    return list(np.array(adjusted))
def mann_withney_test_r(list_values1, list_values2):
    """
    Compare two samples with a two-sided Mann-Whitney-Wilcoxon test run in R.

    The test is R's ``wilcox.test`` called with ``correct=F``.

    :param list_values1: (list of float) first sample
    :param list_values2: (list of float) second sample
    :return: (float) the p-value of the test on ``list_values1`` and ``list_values2``.
    """
    r_source = """

    function(x, y){
        test = wilcox.test(x,y, alternative='two.sided', correct=F)
        return(test$p.value)
    }

                   """
    run_test = robj.r(r_source)
    sample1 = v.FloatVector(list_values1)
    sample2 = v.FloatVector(list_values2)
    return float(run_test(sample1, sample2)[0])
def mann_withney_test_r(list_values1, list_values2, alt="less"):
    """
    Perform a Mann-Whitney-Wilcoxon test on ``list_values1`` and ``list_values2``.

    The test is R's ``wilcox.test`` called with ``correct=F``.

    :param list_values1: (list of float)  list of float
    :param list_values2: (list of float)  list of float
    :param alt: (string) the alternative hypothesis, forwarded unchanged to the \
    ``alternative`` argument of ``wilcox.test`` (R documents "two.sided", "less" \
    and "greater")
    :return: (float) the p-value of the test done on ``list_values1`` and ``list_values2``.
    """
    # ``alt`` is handed to R as a function argument rather than pasted into the
    # R source text, so a quote or any other character in it can no longer
    # break or alter the code that R parses.
    wicox = robj.r("""

    function(x, y, alt){
        test = wilcox.test(x, y, alternative=alt, correct=F)
        return(test$p.value)
    }

                   """)
    pval = float(wicox(v.FloatVector(list_values1),
                       v.FloatVector(list_values2),
                       v.StrVector([str(alt)]))[0])
    return pval
Пример #9
0
def r_ttest(x, y):
    """
    Compare two samples with R's ``t.test`` (default arguments).

    :param x: (list of float value)
    :param y: (list of float value)
    :return: (float) the p-value reported by R, or NaN when the conversion of \
    the inputs or the R call fails
    """
    import rpy2.robjects as robj
    import rpy2.robjects.vectors as v
    ttestmaker = robj.r("""
    function(x,y){
        test = t.test(x,y)
        return(test$p.value)
    }""")
    try:
        pval = ttestmaker(v.FloatVector(x), v.FloatVector(y))
        pval = float(pval[0])
    except Exception:
        # Best effort: any failure yields NaN. ``Exception`` (instead of a bare
        # ``except``) lets KeyboardInterrupt and SystemExit propagate.
        pval = float("nan")
    return pval
Пример #10
0
def _to_mccm_vec(x):
    """
    Compose data into the vector format used for multispatialCCM.R

    Each row of ``x`` is prefixed with a NaN and the rows are then laid end to
    end in a single flat vector.

    Args:
        x: Iterable of rows, each row itself iterable (e.g. a 2-D np.array).
            NOTE(review): presumably all rows have the same length, so that
            the intermediate array is 2-D before flattening -- confirm.

    Returns:
        (rvec.FloatVector) Data in R format
    """
    padded_rows = [[np.nan] + list(row) for row in list(x)]
    flat = np.array(padded_rows).ravel()

    return rvec.FloatVector(flat)
def adeSavGolCorrelation(country):
    """
    Report, per prefix, how correlated the first two matching columns are.

    The data of ``country`` is loaded with ``bpp.algo.importData``. For every
    prefix of a fixed list, the first two columns whose name starts with that
    prefix are compared with the Pearson coefficient and with the first score
    returned by ``minerva.mine``. The resulting table is printed and written
    to an Excel file.

    :param country: (string) forwarded to ``bpp.algo.importData``
    """
    data = bpp.algo.importData(country)
    algos = ['I2', 'I1', 'GM2', 'GM1', 'FF1', 'MP1', 'MP4', 'MP2']
    score_names = ['Pearson', 'MIC']
    correlation_results = pd.DataFrame(columns=algos, index=score_names)

    for prefix in algos:
        matching = [
            name for name in data.columns.values if name.startswith(prefix)
        ]
        pair = data[matching]
        first = pair.iloc[:, 0]
        second = pair.iloc[:, 1]

        pearson = sp.stats.pearsonr(first, second)
        mine_result = minerva.mine(vc.FloatVector(np.asarray(first)),
                                   vc.FloatVector(np.asarray(second)))

        scores = [pearson[0], float(mine_result[0][0])]
        correlation_results[prefix] = pd.Series(scores, index=score_names)

    print(correlation_results)
    report_path = "../Reserach/AdeSavGol Transform/L&C_corr.xlsx"
    correlation_results.to_excel(report_path, engine="openpyxl")
def perform_mann_withney_test(dataframe, sf_name, exon_type):
    """
    Test every project of ``sf_name`` against the ``exon_type`` project.

    The ``values`` of each project (NaN removed) are compared to those of the
    control project with ``mann_withney_test_r``; the p-values are then
    adjusted with R's ``p.adjust(method="BH")``.

    :param dataframe: (pandas DataFrame) must hold the columns ``project`` and ``values``
    :param sf_name: (list of string) the projects to test
    :param exon_type: (string) the project used as control
    :return: (pandas dataFrame) columns ``SF``, ``pval_MW`` and ``p_adj``
    """
    rstats = importr('stats')

    def project_values(project):
        # Non-NaN ``values`` of one project, as a plain list of floats
        rows = dataframe[dataframe["project"] == project]
        arr = np.array(rows.loc[:, "values"].values, dtype=float)
        return list(arr[~np.isnan(arr)])

    control = project_values(exon_type)
    pval_list = [
        mann_withney_test_r(project_values(project), control)
        for project in sf_name
    ]
    pcor = rstats.p_adjust(v.FloatVector(pval_list), method="BH")
    table = pd.DataFrame({"SF": sf_name, "pval_MW": pval_list, "p_adj": pcor})
    return table[["SF", "pval_MW", "p_adj"]]
Пример #13
0
import pytest
from rpy2.robjects import vectors
from rpy2.robjects.packages import importr
from rpy2.ipython import html

# R's "base" package, imported once at module level; ``base.c`` is used as a
# sample R object in the parametrized cases of the test in this module.
base = importr('base')


# The missing comma in ``['a', 'b' 'c']`` silently built ``['a', 'bc']`` through
# implicit string-literal concatenation; the vectors now hold three elements.
@pytest.mark.parametrize(
    'o,func', [(vectors.IntVector([1, 2, 3]), html.html_vector_horizontal),
               (vectors.FloatVector([1, 2, 3]), html.html_vector_horizontal),
               (vectors.StrVector(['a', 'b', 'c']),
                html.html_vector_horizontal),
               (vectors.FactorVector(['a', 'b', 'c']),
                html.html_vector_horizontal),
               (vectors.ListVector({
                   'a': 1,
                   'b': 2
               }), html.html_rlist),
               (vectors.DataFrame({
                   'a': 1,
                   'b': 'z'
               }), html.html_rdataframe),
               ('x <- c(1, 2, 3)', html.html_sourcecode),
               (base.c, html.html_ridentifiedobject)])
def test_html_func(o, func):
    """Each HTML helper must return a ``str`` for its sample object."""
    res = func(o)
    assert isinstance(res, str)
Пример #14
0
def _read_numeric_column(in_fname, column):
    """Read one tab-separated column of ``in_fname`` as a list of floats.

    Blank lines, lines starting with ``#``, lines without the requested column
    and fields that ``float`` rejects are counted as skipped. A field equal to
    "NA" (any case) is kept as NaN.

    :param in_fname: (string) path of the input file
    :param column: (int) 0-based index of the column to read
    :return: (tuple) ``(values, line_count, skipped_lines, first_invalid_line, \
    invalid_value)``; ``first_invalid_line`` is 1-based (0 when nothing was \
    skipped) and ``invalid_value`` is the rejected field of that first line, \
    or '' when that line was skipped for another reason
    """
    values = []
    line_count = 0
    skipped_lines = 0
    first_invalid_line = 0
    invalid_value = ''
    with open(in_fname) as handle:
        for line_number, line in enumerate(handle, start=1):
            line_count = line_number
            line = line.rstrip('\r\n')
            value = None  # stays None when the line yields no usable number
            rejected_field = ''
            if line and not line.startswith('#'):
                fields = line.split("\t")
                try:
                    field = fields[column]
                except IndexError:
                    pass  # the line has too few columns
                else:
                    if field.lower() == "na":
                        value = float("nan")
                    else:
                        try:
                            value = float(field)
                        except ValueError:
                            rejected_field = field
            if value is None:
                skipped_lines += 1
                if not first_invalid_line:
                    first_invalid_line = line_number
                    invalid_value = rejected_field
            else:
                values.append(value)
    return values, line_count, skipped_lines, first_invalid_line, invalid_value


def main():
    """Draw the histogram of one column of a tabular file into a PDF with R.

    Command line arguments (``sys.argv``): input file, output PDF, 1-based
    column number, plot title, x-axis label, number of breaks (0 selects
    "Sturges"), "true" to overlay a density curve and, optionally, "true" to
    plot frequencies instead of probabilities.

    The program exits with an explanatory message when the column argument is
    unusable, when the input is empty, when no line yields a number, or when
    the plotting step raises.
    """
    # Handle input params
    in_fname = sys.argv[1]
    out_fname = sys.argv[2]
    try:
        column = int(sys.argv[3]) - 1
    except (IndexError, ValueError):
        sys.exit("Column not specified, your query does not contain a column of numerical data.")
    title = sys.argv[4]
    xlab = sys.argv[5]
    breaks = int(sys.argv[6])
    if breaks == 0:
        breaks = "Sturges"
    density = sys.argv[7] == "true"
    frequency = len(sys.argv) >= 9 and sys.argv[8] == "true"

    (values, line_count, skipped_lines,
     first_invalid_line, invalid_value) = _read_numeric_column(in_fname, column)

    # Decide on the values actually collected: comparing the skipped count with
    # the last 0-based line index misreported one-line files as empty.
    if not values:
        if line_count == 0:
            sys.exit("Input dataset is empty.")
        sys.exit("All values in column %s are non-numeric." % sys.argv[3])

    try:
        grdevices = importr('grDevices')
        graphics = importr('graphics')
        vector = vectors.FloatVector(values)
        grdevices.pdf(out_fname, 8, 8)
        histogram = graphics.hist(vector, probability=not frequency, main=title, xlab=xlab, breaks=breaks)
        if density:
            # Missing values are dropped here; density() refuses them otherwise
            density_curve = r.density(vector, **{'na.rm': True})
            if frequency:
                mids = histogram.rx2('mids')
                # Number of values R actually binned (missing ones excluded)
                binned = sum(histogram.rx2('counts'))
                scale_factor = binned * (mids[1] - mids[0])  # uniform bandwidth taken from first 2 midpoints
                density_curve.rx2['y'] = vectors.FloatVector(
                    [y * scale_factor for y in density_curve.rx2('y')])
            graphics.lines(density_curve)
        grdevices.dev_off()
    except Exception as exc:
        sys.exit("%s" % str(exc))

    print("Histogram of column %s. " % sys.argv[3])
    if skipped_lines > 0:
        print("Skipped %d invalid lines starting with line #%d, '%s'." % (skipped_lines, first_invalid_line, invalid_value))
Пример #15
0

def shapiro_test(list_val):
    """
    Run R's ``shapiro.test`` on a sample.

    :param list_val: (list of float) list of frequency for a feature corresponding to a set of exon
    :return: the first element of the p-value vector returned by R
    """
    r_source = """
        function(list_val){
            return(shapiro.test(list_val)$p.value)
        }
        """
    r_shapiro = rpy2.robjects.r(r_source)
    r_result = r_shapiro(v.FloatVector(list_val))
    return r_result[0]


def comparison_test(list_val1, list_val2, test):
    """

    :param list_val1: (list of float) list of frequency for a feature corresponding to a set of exon
    :param list_val2: (list of float) list of frequency for a feature corresponding to another set of exon
    :param test: (string) the type of test to use
    :return: (float) pvalue of the comparison test
    """
    ttest = rpy2.robjects.r(
        """
        function(list_val1, list_val2){
            return(t.test(list_val1, list_val2)$p.value)
        }