def test_correlation_graphs(self):
        """Check node and edge attributes of the graph built from ``self.df``.

        Pearson coefficients are computed on ``self.df`` and passed to
        ``correlation_graphs``; the first and last node and the first and
        last edge of the resulting graph are compared against fixed values.
        """
        coeffs = correlation_coefficients(self.df,
                                          max_rt_diff=5.0,
                                          coeff_thres=0.7,
                                          pvalue_thres=1.0,
                                          method="pearson",
                                          block=5000,
                                          ncpus=None)
        graph = correlation_graphs(coeffs, self.df)

        # Each entry is (node, attribute_dict).
        nodes = list(graph.nodes(data=True))
        first_node, last_node = nodes[0], nodes[-1]

        # Each entry is (node_a, node_b, attribute_dict).
        edges = list(graph.edges(data=True))
        first_edge, last_edge = edges[0], edges[-1]

        node_keys = ("mz", "intensity", "rt")
        edge_keys = ("rvalue", "pvalue", "mzdiff", "rtdiff")

        # order is different between python 2 and 3
        np.testing.assert_almost_equal(
            [first_node[1][key] for key in node_keys],
            [168.989654, 520.0, 120.0])
        np.testing.assert_almost_equal(
            [last_node[1][key] for key in node_keys],
            [493.063765, 163.33, 192.5])
        np.testing.assert_almost_equal(
            [first_edge[2][key] for key in edge_keys],
            [1.0, 0.0, 167.982378, 1.0])
        np.testing.assert_almost_equal(
            [last_edge[2][key] for key in edge_keys],
            [1.0, 0.0, 1.003355, 2.5])
# Example #2
# 0
def group_features(df,
                   db_out,
                   max_rt_diff=5.0,
                   coeff_thres=0.7,
                   pvalue_thres=1.0,
                   method="pearson",
                   block=5000,
                   ncpus=None):
    """Group correlated features and store the pairwise edges in SQLite.

    The ``groups`` table in ``db_out`` is dropped and recreated, then filled
    with one row per edge of the correlation graph. Every weakly connected
    component of the graph is given a 1-based ``group_id``.

    Args:
        df: Feature data; passed unchanged to
            ``statistics.correlation_coefficients`` and
            ``statistics.correlation_graphs``.
        db_out: Database location handed to ``sqlite3.connect``.
        max_rt_diff: Forwarded to ``statistics.correlation_coefficients``.
        coeff_thres: Forwarded to ``statistics.correlation_coefficients``.
        pvalue_thres: Forwarded to ``statistics.correlation_coefficients``.
        method: Forwarded to ``statistics.correlation_coefficients``.
        block: Forwarded to ``statistics.correlation_coefficients``.
        ncpus: Forwarded to ``statistics.correlation_coefficients``.

    Returns:
        The graph returned by ``statistics.correlation_graphs``.
    """
    conn = sqlite3.connect(db_out)
    # try/finally guarantees the connection is released even when the
    # correlation step or an insert raises; previously it leaked.
    try:
        cursor = conn.cursor()

        cursor.execute("DROP TABLE IF EXISTS groups")

        cursor.execute("""CREATE TABLE groups (
                   group_id INTEGER DEFAULT NULL,
                   peak_id_a TEXT DEFAULT NULL,
                   peak_id_b TEXT DEFAULT NULL,
                   degree_a INTEGER DEFAULT NULL,
                   degree_b INTEGER DEFAULT NULL,
                   r_value REAL DEFAULT NULL,
                   p_value REAL DEFAULT NULL,
                   rt_diff REAL DEFAULT NULL,
                   mz_diff REAL DEFAULT NULL,
                   PRIMARY KEY (peak_id_a, peak_id_b));""")

        df_coeffs = statistics.correlation_coefficients(
            df, max_rt_diff, coeff_thres, pvalue_thres, method, block, ncpus)
        graph = statistics.correlation_graphs(df_coeffs, df)

        # Materialise all components before touching any graph attribute.
        sub_graphs = [
            graph.subgraph(nodes)
            for nodes in nx.weakly_connected_components(graph)
        ]

        for group_id, sub_graph in enumerate(sub_graphs, start=1):
            # not stored in output - place holder
            # NOTE(review): on networkx >= 2.0 subgraph() returns a view that
            # shares the parent's graph-attribute dict, so this assignment may
            # also be visible on the returned graph - confirm.
            sub_graph.graph["groupid"] = group_id

            # Sort edges by (source, target) so rows are inserted in a
            # deterministic order.
            ordered_edges = sorted(sub_graph.edges(data=True),
                                   key=lambda e: (e[0], e[1]))
            rows = [
                (group_id, str(node_a), str(node_b),
                 sub_graph.degree(node_a), sub_graph.degree(node_b),
                 round(float(attrs["rvalue"]), 2), float(attrs["pvalue"]),
                 float(attrs["rtdiff"]), float(attrs["mzdiff"]))
                for node_a, node_b, attrs in ordered_edges
            ]
            cursor.executemany(
                """insert into groups (group_id, peak_id_a, peak_id_b, degree_a, degree_b,
                              r_value, p_value, rt_diff, mz_diff) values (?,?,?,?,?,?,?,?,?)""",
                rows)
        conn.commit()
    finally:
        conn.close()
    return graph
    def test_correlation_coefficients(self):
        """Compare ``correlation_coefficients`` output with fixed expectations.

        Three calls on ``self.df`` are checked: pearson and spearman with the
        same thresholds (full frame comparison), and pearson with a very wide
        retention-time window and a zero coefficient threshold (shape only).
        """
        columns = ["name_a", "name_b", "r_value", "p_value"]
        names_a = ["M169T120", "M169T120", "M337T121", "M215T170", "M492T190"]
        names_b = ["M337T121", "M505T122", "M505T122", "M231T174", "M493T192"]
        unit_r_values = [np.float64(1.0)] * 5
        # Arguments shared by the pearson and spearman comparisons below.
        shared_kwargs = dict(max_rt_diff=5.0,
                             coeff_thres=0.7,
                             pvalue_thres=1.0,
                             block=5000,
                             ncpus=None)

        # Pearson: the last pair has a tiny but non-zero p-value.
        expected_pearson = pd.DataFrame(
            {
                "name_a": names_a,
                "name_b": names_b,
                "r_value": unit_r_values,
                "p_value": [np.float64(0.0)] * 4
                + [np.float64(5.85415087865495e-157)],
            },
            columns=columns)
        observed = correlation_coefficients(self.df,
                                            method="pearson",
                                            **shared_kwargs)
        pd.testing.assert_frame_equal(observed, expected_pearson)

        # Spearman: same pairs, every p-value is exactly zero.
        expected_spearman = pd.DataFrame(
            {
                "name_a": names_a,
                "name_b": names_b,
                "r_value": unit_r_values,
                "p_value": [np.float64(0.0)] * 5,
            },
            columns=columns)
        observed = correlation_coefficients(self.df,
                                            method="spearman",
                                            **shared_kwargs)
        pd.testing.assert_frame_equal(observed, expected_spearman)

        # Relaxed thresholds: only the shape of the result is checked.
        observed = correlation_coefficients(self.df,
                                            max_rt_diff=50000.0,
                                            coeff_thres=0.0,
                                            pvalue_thres=1.0,
                                            method="pearson",
                                            block=5000,
                                            ncpus=None)
        self.assertEqual(observed.shape, (136, 4))