示例#1
0
def network_graphml(
    network_file: "network file",
    geneset_file: "geneset file",
    output_file: "graphml file for network for visualisation",
    setname: "set name" = None,
    giant_component_only:
    "saves only the giant component of the network" = True,
    minimal: 'saves only the minimal graph' = False,
):
    """
    This function generates a graphml file with nodes annotation.
    Given a geneset, with k setnames, each node has k False/True
    annotations for each set.

    Warning: without minimal, this function saves the full network
    (restricted to its giant component when giant_component_only is set).
    The minimal graph saves only the nodes in the geneset that are found in
    the network and those that connect them with a shortest path. A single
    minimal graph is shared by all the sets and it is the graph written to
    the output file.
    """

    network = rc.ReadTsv(network_file).get_network()
    geneset = rc.ReadGmt(geneset_file).get_geneset(setname)

    if giant_component_only:
        network = network.subgraph(
            max(nx.connected_components(network), key=len))

    # The graph that is annotated must also be the graph that is written,
    # otherwise the minimal option has no effect on the output file.
    graph = _minimal_geneset_graph(network, geneset) if minimal else network
    _annotate_geneset_membership(graph, geneset)

    nx.write_graphml(graph, output_file)


def _minimal_geneset_graph(network, geneset):
    """Build one graph holding the mapped geneset nodes and the shortest paths selected between them."""
    minimal_graph = nx.Graph()
    for name in geneset:
        mapped = [gene for gene in geneset[name] if gene in network.nodes()]
        # Keep mapped genes even when no path reaches them.
        minimal_graph.add_nodes_from(mapped)
        for source in mapped:
            shortest = np.inf
            for target in mapped:
                if source == target:
                    continue
                try:
                    path = nx.shortest_path(network,
                                            source=source,
                                            target=target)
                except nx.NetworkXNoPath:
                    # Possible when the network is not restricted to one
                    # connected component: there is nothing to add.
                    continue
                # A path is added each time it improves on the shortest
                # one found so far for this source.
                if len(path) < shortest:
                    shortest = len(path)
                    nx.add_path(minimal_graph, path)
    return minimal_graph


def _annotate_geneset_membership(graph, geneset):
    """Set one boolean node attribute per set name, True for the nodes that belong to that set."""
    for name in geneset:
        members = set(geneset[name])
        membership = {node: node in members for node in graph.nodes()}
        nx.set_node_attributes(graph, membership, name)
示例#2
0
def test_degree_distribution(
    network_file: "network file",
    geneset_file: "GMT geneset file",
    output_table: "output results table, use .csv extension",
    setname: "Geneset to analyse" = None,
    size_cut: "removes all genesets with a mapped length < size_cut" = 20,
    results_figure: "barplot of results, use pdf or png extension" = None,
    ):
    """
        Performs degree distribution test.
        Kolmogorov-Smirnov statistic on 2 samples.
        H0 is that the geneset is drawn from the same distribution of all the other nodes.
        H0 rejected if statistic is greater.

        One row per geneset is written to the output table; genesets that
        do not pass the size_cut checks are skipped. If results_figure is
        given, a summary plot of the results table is also produced.
    """
    # The test is given the graph itself, as in the other tests of this
    # module, and not the file reader object.
    network = rc.ReadTsv(network_file).get_network()
    geneset = rc.ReadGmt(geneset_file).get_geneset(setname)

    setnames = list(geneset.keys())

    output1 = out.Output(
        network_file, output_table, "test_degree", geneset_file, setnames
    )
    logging.info("Results file = " + output1.output_table_results)
    output1.create_st_table_empirical()

    st_test = KS.KSTest(KS.degree_distribution, network)

    for name, genes in geneset.items():
        genes = set(genes)
        # Genesets that are too small before mapping are not tested at all.
        if len(genes) <= size_cut:
            continue

        observed, pvalue, n_mapped, n_geneset = st_test.apply_test(genes)
        logging.info("Setname:" + name)
        if n_mapped < size_cut:
            logging.info(
                "%s remove from results since nodes mapped are < %d",
                name, size_cut
            )
            continue

        logging.info("Observed: %g p-value: %g", observed, pvalue)
        # No permutations are involved here, hence the constant
        # placeholders passed for the permutation-related columns.
        output1.update_st_table_empirical(
            name, n_mapped, n_geneset, 1, observed, pvalue, 0, 0
        )

    output1.close_temporary_table()
    if results_figure:
        paint.paint_datasets_stats(output1.output_table_results, results_figure, alternative='greater')
示例#3
0
    def __init__(self,
                 gmt_file: str,
                 output_gmt_file: str,
                 conversion: str,
                 entrez_col: str,
                 symbol_col: str,
                 converter_map_filename: str = "entrez_name.tsv"):
        """
        Read a GMT file, convert the gene names of every entry and write
        the result to a new GMT file.

        :param gmt_file: the input GMT file path
        :param output_gmt_file: the output GMT file path
        :param conversion: "e2s" for Entrez to Symbol, "s2e" for Symbol to
            Entrez; any other value is logged as an error and the data is
            written without conversion
        :param entrez_col: the name of the entrez column
        :param symbol_col: the name of the symbol column
        :param converter_map_filename: the path to the .tsv used to convert the genes name
        """
        super().__init__()
        self.gmt_file = gmt_file
        self.output_gmt_file = output_gmt_file
        self.conversion = conversion
        self.entrez_col = entrez_col
        self.symbol_col = symbol_col
        self.converter_map_filename = converter_map_filename

        gmt_reader = rc.ReadGmt(self.gmt_file, True)
        self.gmt_data = gmt_reader.get_data()
        map_reader = rc.ReadTsv(self.converter_map_filename, pd_table=True)
        self.tsv_data = map_reader.get_data()

        # Select the parent-class converter for the requested direction.
        if self.conversion == "e2s":
            convert = super().convert_e2s
        elif self.conversion == "s2e":
            convert = super().convert_s2e
        else:
            convert = None
            logging.error("Conversion type not understood")

        if convert is not None:
            for name, record in self.gmt_data.items():
                converted = convert(record["genes"], self.tsv_data,
                                    self.entrez_col, self.symbol_col)
                self.gmt_data[name]["genes"] = converted

        # The output is written in every case, converted or not.
        super()._gmt_output(self.gmt_data, self.output_gmt_file)
示例#4
0
def network_summary(
        network_file: "network file",
        text_output: "output text file for the summary",
        degree_figure_file: "pdf or png file for the degree distribution",
        c_components_figure_file:
    "pdf or png file for the connected components distribution",
        geneset_input_file: "geneset file" = None,
        setname: "specify a single geneset" = None):
    '''Save the main information about a graph:

    - a text summary of the network properties
    - a plot of the degree distribution
    - a plot of the connected components

    When a geneset file is passed together with a setname, the same
    outputs are produced for the subgraph induced by the geneset instead
    of the whole network. A geneset file without a setname is reported as
    an error and nothing is written.
    '''

    logging.info("Evaluating network summary, please wait")
    network = rc.ReadTsv(network_file).get_network()

    if not geneset_input_file:
        # No geneset: summarise the whole network.
        out.write_graph_summary(network, text_output, network_file)
        components = nx.connected_components(network)
        diagnostic.plot_connected_components(components,
                                             c_components_figure_file)
        degrees = nx.degree(network)
        diagnostic.plot_degree(degrees, degree_figure_file)
    elif setname:
        geneset = rc.ReadGmt(geneset_input_file).get_geneset(setname)
        for name, genes in geneset.items():
            induced = nx.subgraph(network, genes)
            title = name + " on " + network_file
            out.write_graph_summary(induced, text_output, title)
            components = nx.connected_components(induced)
            diagnostic.plot_connected_components(components,
                                                 c_components_figure_file)
            degrees = nx.degree(induced)
            diagnostic.plot_degree(degrees, degree_figure_file)
    else:
        logging.error('Missing setname name, specify  a unique setname')

    logging.info("Network summary completed")
示例#5
0
def test_topology_total_degree(
    network_file: "network file",
    geneset_file: "GMT geneset file",
    output_table: "output results table, use .csv extension",
    setname: "Geneset to analyse" = None,
    size_cut: "removes all genesets with a mapped length < size_cut" = 20,
    number_of_permutations:
    "number of permutations for computing the empirical pvalue" = 500,
    cores: "Number of cores for the multiprocessing" = 1,
    n_bins:
    'if >1 applies degree correction by binning the node degrees and sampling according to geneset distribution' = 1,
    results_figure: "barplot of results, use pdf or png extension" = None,
    diagnostic_null_folder:
    "plot null distribution, pass the folder where all the figures are going to be saved "
    "(one for each dataset)" = None):
    """
    Performs the analysis of the total degree of the geneset.

    It computes a p-value for the ratio of total degree of the geneset being bigger than the one expected by chance
    for a geneset of the same size.

    One row per tested geneset is written to the output table. If
    diagnostic_null_folder is given, the null distribution of each tested
    geneset is plotted to a file whose name is the folder string directly
    followed by the set name, so the folder should end with a path
    separator.
    """
    logging.info("Evaluating the test topology total degree, please wait")
    network = rc.ReadTsv(network_file).get_network()
    geneset = rc.ReadGmt(geneset_file).get_geneset(setname)

    setnames = list(geneset.keys())

    # Generate output
    output1 = out.Output(network_file, output_table, "topology_total_degree",
                         geneset_file, setnames)
    logging.info("Results file = " + output1.output_table_results)

    # Create table
    output1.create_st_table_empirical()
    st_test = st.StatisticalTest(st.geneset_total_degree_statistic,
                                 network,
                                 degree_bins=n_bins)

    for name, genes in geneset.items():
        # Geneset smaller than size cut are not taken into consideration
        if len(genes) <= size_cut:
            continue

        observed, pvalue, null_d, n_mapped, n_geneset = st_test.empirical_pvalue(
            set(genes),
            max_iter=number_of_permutations,
            alternative="greater",
            cores=cores)
        logging.info("Setname:" + name)
        if n_mapped < size_cut:
            logging.info(
                "%s removed from results since nodes mapped are < %d",
                name, size_cut)
            continue

        null_mean = np.mean(null_d)
        null_variance = np.var(null_d)
        logging.info("Observed: %g p-value: %g", observed, pvalue)
        # Values are passed as logging arguments so that the %g
        # placeholders are actually filled in.
        logging.info("Null mean: %g null variance: %g", null_mean,
                     null_variance)
        output1.update_st_table_empirical(name, n_mapped, n_geneset,
                                          number_of_permutations, observed,
                                          pvalue, null_mean, null_variance)
        if diagnostic_null_folder:
            diagnostic.plot_null_distribution(
                null_d,
                observed,
                diagnostic_null_folder + name +
                '_total_degree_null_distribution.pdf',
                setname=name)

    output1.close_temporary_table()
    if results_figure:
        paint.paint_datasets_stats(output1.output_table_results,
                                   results_figure,
                                   alternative='greater')
    logging.info("Test topology total degree completed")
示例#6
0
def test_association_rwr(
    network_file: "network file",
    file_geneset_a: "GMT geneset file",
    rwr_matrix_filename: ".hdf5 file with the RWR matrix obtained by pygna",
    output_table: "output results table, use .csv extension",
    setname_a: "Geneset A to analyse" = None,
    file_geneset_b: "GMT geneset file" = None,
    setname_b: "Geneset B to analyse" = None,
    size_cut: "removes all genesets with a mapped length < size_cut" = 20,
    keep: "if true, keeps the geneset B unpermuted" = False,
    cores: "Number of cores for the multiprocessing" = 1,
    in_memory: "set if you want the large matrix to be read in memory" = False,
    number_of_permutations:
    "number of permutations for computing the empirical pvalue" = 500,
    n_bins:
    'if >1 applies degree correction by binning the node degrees and sampling according to geneset distribution' = 1,
    results_figure: "heatmap of results" = None,
):
    """
    Performs comparison of network location analysis on the RWR matrix.

    It computes an empirical p-value for the random walk statistic between
    two genesets being greater than expected by chance. If the flag --keep
    is passed, the B geneset is kept fixed and does not get permuted.

    If only file_geneset_a is passed the analysis is run on all the pairs of sets in the file, if both
    file_geneset_a and file_geneset_b are passed, one can specify the setnames for both, if there is only one
    geneset in the file, setname_X can be omitted, if both sets are in the same file, file_geneset_b can be not
    specified, but setnames are needed.

    The analysis is restricted to the largest connected component of the
    network. Pairs where either geneset has no more than size_cut distinct
    genes are skipped with a warning.
    """

    analysis_name_str = "association_rwr" if keep else "comparison_rwr"

    network = rc.ReadTsv(network_file).get_network()
    network = nx.Graph(
        network.subgraph(max(nx.connected_components(network), key=len)))

    # Read datasets
    if setname_a and setname_b is None and file_geneset_b is None:
        logging.error(" this analysis requires at least two genesets ")

    # Read the matrix file once and reuse both parts of the result.
    rwr_data = read_distance_matrix(rwr_matrix_filename, in_memory=in_memory)
    rw_dict = {"nodes": rwr_data[0], "matrix": rwr_data[1]}

    geneset_a = rc.ReadGmt(file_geneset_a).get_geneset(setname_a)
    if file_geneset_b:
        geneset_b = rc.ReadGmt(file_geneset_b).get_geneset(setname_b)
    elif setname_b:
        geneset_b = rc.ReadGmt(file_geneset_a).get_geneset(setname_b)
    else:
        geneset_b = None

    st_comparison = sc.StatisticalComparison(sc.comparison_random_walk,
                                             network,
                                             n_proc=cores,
                                             diz=rw_dict,
                                             degree_bins=n_bins)

    if not geneset_b:
        # Single geneset collection: compare every unordered pair of sets.
        logging.info("Analysing all the sets in " + file_geneset_a)
        setnames = list(geneset_a.keys())
        output1 = out.Output(network_file, output_table, analysis_name_str,
                             file_geneset_a, setnames)
        pairs = (((first, geneset_a[first]), (second, geneset_a[second]))
                 for first, second in itertools.combinations(setnames, 2))
    else:
        # Two collections: compare every set of A with every set of B.
        logging.info("geneset_a contains %d sets", len(geneset_a))
        sets_a = list(geneset_a.keys())
        logging.info("Setnames in A: " + str(sets_a))
        logging.info("geneset_b contains %d sets", len(geneset_b))
        sets_b = list(geneset_b.keys())
        logging.info("Setnames in B: " + str(sets_b))
        output1 = out.Output(network_file, output_table, analysis_name_str,
                             file_geneset_a, sets_a, file_geneset_b, sets_b)
        pairs = itertools.product(geneset_a.items(), geneset_b.items())

    logging.info("Results file = " + output1.output_table_results)
    output1.create_comparison_table_empirical()

    for (set_a, item_a), (set_b, item_b) in pairs:
        # Sizes are measured on distinct genes, the same counts that are
        # reported in the results table.
        genes_a = set(item_a)
        genes_b = set(item_b)

        if len(genes_a) <= size_cut or len(genes_b) <= size_cut:
            logging.warning(
                "Geneset A has %d terms and Geneset B has %d terms. "
                "\nOne of them is too short, analysis not done",
                len(genes_a), len(genes_b))
            continue

        logging.info("Analysing " + str(set_a) + " and " + str(set_b))
        n_overlaps = len(genes_a.intersection(genes_b))
        observed, pvalue, null_d, a_mapped, b_mapped = st_comparison.comparison_empirical_pvalue(
            genes_a,
            genes_b,
            max_iter=number_of_permutations,
            alternative="greater",
            keep=keep)
        logging.info("Observed: %g p-value: %g", observed, pvalue)

        output1.update_comparison_table_empirical(
            set_a, set_b, len(genes_a), a_mapped, len(genes_b), b_mapped,
            n_overlaps, number_of_permutations, observed, pvalue,
            np.mean(null_d), np.var(null_d))

    output1.close_temporary_table()
    if results_figure:
        paint.paint_comparison_matrix(output1.output_table_results,
                                      results_figure,
                                      rwr=True)
示例#7
0
def test_association_sp(
    network_file: "network file",
    file_geneset_a:
    "GMT geneset file, if it's the only parameter passed the analysis is gonna be run on all the "
    "pair of datasets, otherwise specify the other files and setnames",
    distance_matrix_filename: "distance matrix file generated by pygna",
    output_table: "output results table, use .csv extension",
    setname_a: "Geneset A to analyse" = None,
    file_geneset_b: "GMT geneset file" = None,
    setname_b: "Geneset B to analyse" = None,
    size_cut: "removes all genesets with a mapped length < size_cut" = 20,
    keep: "if true, keeps the geneset B not permuted" = False,
    cores: "Number of cores for the multiprocessing" = 1,
    in_memory: "set if you want the large matrix to be read in memory" = False,
    number_of_permutations:
    "number of permutations for computing the empirical pvalue" = 500,
    n_bins:
    'if >1 applies degree correction by binning the node degrees and sampling according to geneset distribution' = 1,
    results_figure: "barplot of results, use pdf or png extension" = None,
):
    """
    Performs comparison of network location analysis. If the flag
    --keep is passed, the B geneset is kept fixed, and doesn't get permuted.

    It computes a p-value for the shortest path distance between two genesets being smaller than expected by chance.
    If only file_geneset_a is passed the analysis is run on all the pairs of sets in the file, if both
    file_geneset_a and file_geneset_b are passed, one can specify the setnames for both, if there is only one
    geneset in the file, setname_X can be omitted, if both sets are in the same file, file_geneset_b can be not
    specified, but setnames are needed.

    The analysis is restricted to the largest connected component of the
    network. Pairs where either geneset has no more than size_cut distinct
    genes are skipped with a warning.
    """

    analysis_name_str = "association_sp" if keep else "comparison_sp"

    network = rc.ReadTsv(network_file).get_network()
    network = nx.Graph(
        network.subgraph(max(nx.connected_components(network), key=len)))

    # Read matrix: the file is read once and both parts are reused.
    distance_data = read_distance_matrix(distance_matrix_filename,
                                         in_memory=in_memory)
    sp_diz = {"nodes": distance_data[0], "matrix": distance_data[1]}

    # The matrix is added to its transpose and the diagonal is set to
    # infinity before it is handed to the comparison.
    sp_diz["matrix"] = sp_diz["matrix"] + np.transpose(sp_diz["matrix"])
    np.fill_diagonal(sp_diz["matrix"], np.inf)

    # Managing the different genesets
    if setname_a and setname_b is None and file_geneset_b is None:
        logging.error(" this analysis requires at least two genesets ")

    geneset_a = rc.ReadGmt(file_geneset_a).get_geneset(setname_a)
    if file_geneset_b:
        geneset_b = rc.ReadGmt(file_geneset_b).get_geneset(setname_b)
    elif setname_b:
        geneset_b = rc.ReadGmt(file_geneset_a).get_geneset(setname_b)
    else:
        geneset_b = None

    st_comparison = sc.StatisticalComparison(sc.comparison_shortest_path,
                                             network,
                                             diz=sp_diz,
                                             n_proc=cores,
                                             degree_bins=n_bins)

    if not geneset_b:  # Analysis of genesets inside a single file
        logging.info("Analysing all the sets in " + file_geneset_a)
        setnames = list(geneset_a.keys())

        # Creating the output table
        output1 = out.Output(network_file, output_table, analysis_name_str,
                             file_geneset_a, setnames)
        pairs = (((first, geneset_a[first]), (second, geneset_a[second]))
                 for first, second in itertools.combinations(setnames, 2))
    else:  # Analysis of genesets into two different GMT files
        logging.info("geneset_a contains %d sets", len(geneset_a))
        sets_a = list(geneset_a.keys())
        logging.info("geneset_b contains %d sets", len(geneset_b))
        sets_b = list(geneset_b.keys())
        output1 = out.Output(network_file, output_table, analysis_name_str,
                             file_geneset_a, sets_a, file_geneset_b, sets_b)
        pairs = itertools.product(geneset_a.items(), geneset_b.items())

    logging.info("Results file = " + output1.output_table_results)
    output1.create_comparison_table_empirical()

    for (set_a, item_a), (set_b, item_b) in pairs:
        # Sizes are measured on distinct genes, the same counts that are
        # reported in the results table.
        genes_a = set(item_a)
        genes_b = set(item_b)

        if len(genes_a) <= size_cut or len(genes_b) <= size_cut:
            logging.warning(
                "Geneset A has %d terms and Geneset B has %d terms. "
                "\nOne of them is too short, analysis not done",
                len(genes_a), len(genes_b))
            continue

        logging.info("Analysing " + str(set_a) + " and " + str(set_b))
        n_overlaps = len(genes_a.intersection(genes_b))
        observed, pvalue, null_d, a_mapped, b_mapped = st_comparison.comparison_empirical_pvalue(
            genes_a,
            genes_b,
            max_iter=number_of_permutations,
            keep=keep)
        logging.info("Observed: %g p-value: %g", observed, pvalue)

        # Save the results
        output1.update_comparison_table_empirical(
            set_a, set_b, len(genes_a), a_mapped, len(genes_b), b_mapped,
            n_overlaps, number_of_permutations, observed, pvalue,
            np.mean(null_d), np.var(null_d))

    output1.close_temporary_table()
    if results_figure:
        paint.paint_comparison_matrix(output1.output_table_results,
                                      results_figure)
示例#8
0
def test_topology_sp(
    network_file: "network file",
    geneset_file: "GMT geneset file",
    distance_matrix_filename: "distance hdf5 matrix file generated by pygna",
    output_table: "output results table, use .csv extension",
    setname: "Geneset to analyse" = None,
    size_cut: "removes all genesets with a mapped length < size_cut" = 20,
    number_of_permutations:
    "number of permutations for computing the empirical pvalue" = 500,
    cores: "Number of cores for the multiprocessing" = 1,
    in_memory: "set if you want the large matrix to be read in memory" = False,
    n_bins:
    'if >1 applies degree correction by binning the node degrees and sampling according to geneset distribution' = 1,
    results_figure: "barplot of results, use pdf or png extension" = None,
    diagnostic_null_folder:
    "plot null distribution, pass the folder where all the figures are going to be saved "
    "(one for each dataset)" = None,
):
    """
    Performs geneset network topology shortest path analysis.

    It computes a p-value for the average shortest path length
    of the geneset being smaller than expected by chance
    for a geneset of the same size.

    The analysis is restricted to the largest connected component of the
    network. Genesets with no more than size_cut distinct genes are
    skipped. If diagnostic_null_folder is given, the null distribution of
    each tested geneset is plotted to a file whose name is the folder
    string directly followed by the set name, so the folder should end
    with a path separator.
    """

    network = rc.ReadTsv(network_file).get_network()
    network = nx.Graph(
        network.subgraph(max(nx.connected_components(network), key=len)))

    geneset = rc.ReadGmt(geneset_file).get_geneset(setname)

    # Read the matrix file once and reuse both parts of the result.
    distance_data = read_distance_matrix(distance_matrix_filename,
                                         in_memory=in_memory)
    diz = {"nodes": distance_data[0], "matrix": distance_data[1]}
    # The matrix is added to its transpose and the diagonal is set to
    # infinity before it is handed to the test.
    diz["matrix"] = diz["matrix"] + np.transpose(diz["matrix"])
    np.fill_diagonal(diz["matrix"], float("inf"))
    setnames = list(geneset.keys())

    output1 = out.Output(network_file, output_table, "topology_sp",
                         geneset_file, setnames)
    logging.info("Results file = " + output1.output_table_results)
    output1.create_st_table_empirical()
    st_test = st.StatisticalTest(st.geneset_localisation_statistic,
                                 network,
                                 diz,
                                 degree_bins=n_bins)

    for name, genes in geneset.items():
        genes = set(genes)
        if len(genes) <= size_cut:
            # The check is on the geneset size, not on the mapped nodes.
            logging.info(
                "%s removed from results since geneset size is <= %d",
                name, size_cut)
            continue

        logging.info("Setname:" + name)
        # The direction of the test is stated explicitly so that it
        # matches the "less" alternative used for the plots below.
        observed, pvalue, null_d, n_mapped, n_geneset = st_test.empirical_pvalue(
            genes,
            cores=cores,
            max_iter=number_of_permutations,
            alternative="less")
        logging.info("Observed: %g p-value: %g", observed, pvalue)

        output1.update_st_table_empirical(name, n_mapped, n_geneset,
                                          number_of_permutations, observed,
                                          pvalue, np.mean(null_d),
                                          np.var(null_d))
        if diagnostic_null_folder:
            diagnostic.plot_null_distribution(
                null_d,
                observed,
                diagnostic_null_folder + name + '_sp_null_distribution.pdf',
                setname=name,
                alternative="less")

    output1.close_temporary_table()
    if results_figure:
        paint.paint_datasets_stats(output1.output_table_results,
                                   results_figure,
                                   alternative='less')
示例#9
0
def test_topology_module(
    network_file: "network file",
    geneset_file: "GMT geneset file",
    output_table: "output results table, use .csv extension",
    setname: "Geneset to analyse" = None,
    size_cut: "removes all genesets with a mapped length < size_cut" = 20,
    number_of_permutations:
    "number of permutations for computing the empirical pvalue" = 500,
    cores: "Number of cores for the multiprocessing" = 1,
    n_bins:
    'if >1 applies degree correction by binning the node degrees and sampling according to geneset distribution' = 1,
    output_lcc:
    "for creating a GMT file with the LCC lists pass a GMT filename" = None,
    results_figure: "barplot of results, use pdf or png extension" = None,
    diagnostic_null_folder:
    "plot null distribution, pass the folder where all the figures are going to be saved "
    "(one for each dataset)" = None,
):
    """
    Run the geneset network topology module analysis.

    For every geneset an empirical p-value is computed for its largest
    connected component being bigger than the one expected by chance for
    a geneset of the same size. When output_lcc is given, the largest
    connected component of each analysed geneset is also collected and
    written to that GMT file.
    """
    network = rc.ReadTsv(network_file).get_network()
    geneset = rc.ReadGmt(geneset_file).get_geneset(setname)

    setnames = list(geneset.keys())
    output1 = out.Output(network_file, output_table, "topology_module",
                         geneset_file, setnames)
    logging.info("Results file = " + output1.output_table_results)
    output1.create_st_table_empirical()

    st_test = st.StatisticalTest(st.geneset_module_statistic,
                                 network,
                                 degree_bins=n_bins)

    for name, genes in geneset.items():
        genes = set(genes)
        # Genesets that are too small are not analysed.
        if len(genes) <= size_cut:
            continue

        if output_lcc:
            module = nx.subgraph(network, genes)
            if len(module.nodes) > 0:
                # First component of maximal size, as a descending stable
                # sort followed by taking the head would give.
                lcc = max(nx.connected_components(module), key=len)
            else:
                lcc = []
            output1.add_GMT_entry(name, "topology_module", lcc)

        observed, pvalue, null_d, n_mapped, n_geneset = st_test.empirical_pvalue(
            genes,
            max_iter=number_of_permutations,
            alternative="greater",
            cores=cores)
        logging.info("Setname:" + name)

        if n_mapped < size_cut:
            logging.info(
                "%s remove from results since nodes mapped are < %d" %
                (name, size_cut))
            continue

        logging.info("Observed: %g p-value: %g" % (observed, pvalue))
        null_mean = np.mean(null_d)
        null_variance = np.var(null_d)
        output1.update_st_table_empirical(name, n_mapped, n_geneset,
                                          number_of_permutations, observed,
                                          pvalue, null_mean, null_variance)
        if diagnostic_null_folder:
            figure_path = (diagnostic_null_folder + name +
                           '_module_null_distribution.pdf')
            diagnostic.plot_null_distribution(null_d,
                                              observed,
                                              figure_path,
                                              setname=name)

    output1.close_temporary_table()
    if output_lcc:
        output1.create_GMT_output(output_lcc)

    if results_figure:
        paint.paint_datasets_stats(output1.output_table_results,
                                   results_figure,
                                   alternative='greater')
示例#10
0
def test_topology_rwr(
    network_file: "network file, use a network with weights",
    geneset_file: "GMT geneset file",
    rwr_matrix_filename: "hdf5 RWR matrix obtained with pygna ",
    output_table: "output results table, use .csv extension",
    setname: "Geneset to analyse" = None,
    size_cut: "removes all genesets with a mapped length < size_cut" = 20,
    number_of_permutations:
    "number of permutations for computing the empirical pvalue" = 500,
    cores: "Number of cores for the multiprocessing" = 1,
    in_memory: "set if you want the large matrix to be read in memory" = False,
    n_bins:
    'if >1 applies degree correction by binning the node degrees and sampling according to geneset distribution' = 1,
    results_figure: "barplot of results, use pdf or png extension" = None,
    diagnostic_null_folder:
    "plot null distribution, pass the folder where all the figures are going to be saved "
    "(one for each dataset)" = None,
):
    """
    Test the random-walk-with-restart (RWR) statistic of each geneset.

    The network is restricted to its largest connected component and, for
    every geneset read from ``geneset_file``, ``st.geneset_RW_statistic`` is
    evaluated through ``st.StatisticalTest.empirical_pvalue`` with
    ``number_of_permutations`` permutations and ``alternative="greater"``.

    One row per retained geneset is appended to the results table
    (observed value, p-value, mean and variance of the null distribution).
    A geneset is skipped when it holds ``size_cut`` genes or fewer, or when
    fewer than ``size_cut`` of its genes are mapped by the test.

    Optionally a null-distribution plot per geneset is saved under
    ``diagnostic_null_folder`` (used as a plain string prefix) and a summary
    figure is written to ``results_figure``.
    """

    network = rc.ReadTsv(network_file).get_network()
    # Keep an independent copy of the largest connected component only.
    network = nx.Graph(
        network.subgraph(max(nx.connected_components(network), key=len)))
    geneset = rc.ReadGmt(geneset_file).get_geneset(setname)

    # Read the RWR file a single time: the reader returns the node list at
    # index 0 and the matrix at index 1, so there is no need to load the
    # (potentially large) file once per field.
    rwr_data = read_distance_matrix(rwr_matrix_filename, in_memory=in_memory)
    rw_dict = {"nodes": rwr_data[0], "matrix": rwr_data[1]}

    setnames = list(geneset.keys())
    output1 = out.Output(network_file, output_table, "topology_rwr",
                         geneset_file, setnames)

    logging.info("Results file = " + output1.output_table_results)
    output1.create_st_table_empirical()
    st_test = st.StatisticalTest(st.geneset_RW_statistic,
                                 network,
                                 rw_dict,
                                 degree_bins=n_bins)

    for setname, item in geneset.items():
        item = set(item)

        # Pre-filter on the raw (de-duplicated) geneset size. The comparison
        # is strict, as in the other topology tests of this file.
        if len(item) <= size_cut:
            logging.info(
                "%s removed from results since nodes mapped are < %d",
                setname, size_cut)
            continue

        observed, pvalue, null_d, n_mapped, n_geneset = st_test.empirical_pvalue(
            item,
            max_iter=number_of_permutations,
            alternative="greater",
            cores=cores)
        logging.info("Setname:" + setname)

        # Second filter: enough genes must actually be mapped by the test.
        if n_mapped < size_cut:
            logging.info(
                "%s remove from results since nodes mapped are < %d",
                setname, size_cut)
            continue

        logging.info("Observed: %g p-value: %g", observed, pvalue)
        if diagnostic_null_folder:
            diagnostic.plot_null_distribution(
                null_d,
                observed,
                diagnostic_null_folder + setname +
                '_rwr_null_distribution.pdf',
                setname=setname)
        # saving output
        output1.update_st_table_empirical(setname, n_mapped, n_geneset,
                                          number_of_permutations,
                                          observed, pvalue,
                                          np.mean(null_d),
                                          np.var(null_d))

    output1.close_temporary_table()
    if results_figure:
        paint.paint_datasets_stats(output1.output_table_results,
                                   results_figure,
                                   alternative='greater')
示例#11
0
def get_connected_components(
        network_file: "network tsv file",
        output_gmt: "The output file name (should be gmt)",
        name: 'pass a name for the putput gmt terms',
        geneset_file:
    "GMT of the geneset file, is a file is passed please add the setname" = None,
        setname: "The setname to analyse" = None,
        graphml:
    "Pass a graphml filename to show the results on Cytoscape" = None,
        threshold: 'ignores all CC smaller than this value' = 1,
        convert_entrez: "pass flag to convert EntrezID->Symbol" = False):
    """
    Export the connected components of a network (or of a geneset subgraph).

    When ``geneset_file`` is given, the network is first restricted to the
    subgraph induced by one geneset: the one called ``setname``, or the first
    entry of the GMT file when ``setname`` is None. Every connected component
    with more than ``threshold`` nodes is written to ``output_gmt`` as a GMT
    entry named ``<name>_<k>``, with ``k`` counting the retained components
    from 1.

    If ``convert_entrez`` is set, the node identifiers of each component are
    converted to symbols through mygene (this requires a working internet
    connection); identifiers for which no symbol is returned are kept as is.

    The analysed (sub)network is written as graphml only when a ``graphml``
    filename is passed.
    """

    network = rc.ReadTsv(network_file).get_network()

    if isinstance(geneset_file, str):
        if setname is None:
            geneset = rc.ReadGmt(geneset_file).get_geneset()
            first_setname = list(geneset.keys())[0]
            logging.warning('using only first entry of the gmt: %s',
                            first_setname)
            genes = geneset[first_setname]
        else:
            genes = rc.ReadGmt(geneset_file).get_geneset(setname)[setname]
        network = nx.subgraph(network, list(set(genes)))

    logging.debug("Network under analysis: %s", network)

    output1 = out.Output(network_file, 'o.csv', "network_gmt", geneset_file,
                         setname)
    output1.create_st_table_empirical()

    # The mygene client is only needed when a conversion is requested.
    mg = mygene.MyGeneInfo() if convert_entrez else None

    n_kept = 0
    for component in nx.connected_components(network):
        logging.debug("Connected component: %s", component)
        if len(component) <= threshold:
            continue
        n_kept += 1

        members = component
        if convert_entrez:
            hits = mg.querymany(list(component),
                                scopes='entrezgene',
                                fields='symbol',
                                species='human')
            members = []
            for hit in hits:
                if "symbol" in hit:
                    members.append(hit["symbol"])
                elif "query" in hit:
                    # No symbol came back for this identifier (e.g. it was
                    # not found): keep the original ID instead of failing
                    # with a KeyError.
                    members.append(hit["query"])

        output1.add_GMT_entry(name + "_" + str(n_kept),
                              "connected components", members)

    output1.create_GMT_output(output_gmt)

    # graphml is optional: writing to a None path would raise.
    if graphml:
        if len(network.nodes()) > 1000:
            logging.info(
                'There are more than 1000 nodes in the network, the graphml file might be very large.'
            )
        nx.write_graphml(network, graphml)
示例#12
0
def plot_adjacency(
    network: "network_filename",
    output_file: 'use png or pdf for output figure',
    clusters_file: "file of clusters to order the nodes" = None,
    size: "number of genes to plot, allows to cut the adjacency matrix" = None,
):
    """
    Plot the adjacency matrix of a network as a greyscale heatmap.

    When ``clusters_file`` is given, the rows/columns follow the order of
    the genes listed in its sets and two coloured strips (top and left)
    mark which set each node belongs to; set names are annotated on the top
    strip. The ordering is built by concatenating the sets, so the sets are
    expected to be complete and non overlapping.

    ``size`` optionally restricts the plot to the first ``size`` nodes.
    The figure is saved to ``output_file`` as pdf or png according to its
    extension; any other extension falls back to ``output_file + '.png'``.
    """

    graph = rc.ReadTsv(network).get_data()
    if len(graph.nodes) > 1000:
        logging.warning(
            "Graph is larger than 1k nodes, plotting might take too long")

    nodelist = None
    nodelabels = []
    if clusters_file:
        geneset = rc.ReadGmt(clusters_file).get_data()
        nodelist = [gene for genes in geneset.values() for gene in genes]
        # One integer label per set (1, 2, ...), repeated for each member.
        for label, genes in enumerate(geneset.values(), start=1):
            nodelabels.extend([label] * len(genes))
    # Row vector of labels, shape (1, n_nodes); (1, 0) without clusters.
    nodelabels = np.asarray(nodelabels)[np.newaxis, :]

    matrix = nx.adjacency_matrix(graph, nodelist=nodelist).toarray()

    if size:
        n_shown = int(size)
        matrix = matrix[0:n_shown, 0:n_shown]
        nodelabels = nodelabels[:, 0:n_shown]

    # Set font size
    sns.set(font_scale=2)
    if clusters_file:
        f, axes = plt.subplots(
            2,
            2,
            figsize=(10, 10),
            gridspec_kw={
                "width_ratios": [1, 100],
                "height_ratios": [1, 100],
                "wspace": 0.005,
                "hspace": 0.005,
            },
        )
        # Blank filler for the top-left corner of the grid.
        sns.heatmap(
            [[0, 0], [0, 0]],
            cmap="Greys",
            vmin=0,
            vmax=1,
            square=True,
            ax=axes[0, 0],
            cbar=False,
            xticklabels=False,
            yticklabels=False,
        )
        sns.heatmap(
            matrix,
            cmap="Greys",
            vmin=0,
            vmax=1,
            square=True,
            ax=axes[1, 1],
            cbar=False,
            xticklabels=False,
            yticklabels=False,
        )

        # Annotate each set name at the offset of its first node; when the
        # matrix is cut, sets starting beyond the cut are not annotated.
        offset = 0
        for set_label, genes in geneset.items():
            if not size or offset < int(size):
                axes[0, 1].annotate(set_label,
                                    xy=(offset, 0),
                                    xytext=(offset, 0))
            offset += len(genes)

        n_sets = len(geneset)
        # Top strip: set membership along the columns.
        sns.heatmap(
            nodelabels,
            xticklabels=False,
            yticklabels=False,
            vmin=1,
            vmax=n_sets,
            square=False,
            cbar=False,
            linewidths=0,
            cmap="Set3",
            ax=axes[0, 1],
        )
        # Left strip: the same labels along the rows.
        sns.heatmap(
            nodelabels.T,
            xticklabels=False,
            yticklabels=False,
            vmin=1,
            vmax=n_sets,
            square=False,
            cbar=False,
            linewidths=0,
            cmap="Set3",
            ax=axes[1, 0],
        )
    else:
        # With a 1x1 grid plt.subplots returns a single Axes object (not an
        # array), so it must be passed directly instead of being indexed.
        f, axes = plt.subplots(1, 1, figsize=(10, 10))
        sns.heatmap(
            matrix,
            cmap="Greys",
            vmin=0,
            vmax=1,
            square=True,
            ax=axes,
            cbar=False,
            xticklabels=False,
            yticklabels=False,
        )

    if output_file.endswith('.pdf'):
        f.savefig(output_file, format="pdf")
    elif output_file.endswith('.png'):
        f.savefig(output_file, format="png")
    else:
        logging.warning(
            'The adjacency matrix figure can only be saved in pdf or png, forced to png'
        )
        f.savefig(output_file + '.png', format="png")
示例#13
0
def test_topology_centrality(
    network_file: "network file",
    geneset_file: "GMT geneset file",
    distance_matrix_filename: "The matrix with the SP for each node",
    output_table: "output results table, use .csv extension",
    setname: "Geneset to analyse" = None,
    size_cut: "removes all genesets with a mapped length < size_cut" = 20,
    number_of_permutations:
    "number of permutations for computing the empirical pvalue" = 500,
    cores: "Number of cores for the multiprocessing" = 1,
    in_memory: 'load hdf5 data onto memory' = False,
):
    """
    This function calculates the average closeness centrality of a geneset.
    For a single node, the closeness centrality is defined as the inverse
    of the shortest path distance of the node from all the other nodes.

    The network is restricted to its largest connected component. The
    distance matrix is added to its transpose, its diagonal is set to 0 and
    its column sums are stored as ``diz['vector']``; this dictionary is the
    data handed to ``st.StatisticalTest`` together with the
    ``average_closeness_centrality`` statistic.

    For every geneset with more than ``size_cut`` genes an empirical p-value
    is computed (``number_of_permutations`` permutations,
    ``alternative="greater"``); results with fewer than ``size_cut`` mapped
    genes are discarded, the others are appended to the results table.
    """

    logging.info("Evaluating the test topology centrality, please wait")
    network = rc.ReadTsv(network_file).get_network()
    # Keep an independent copy of the largest connected component only.
    network = nx.Graph(
        network.subgraph(max(nx.connected_components(network), key=len)))
    geneset = rc.ReadGmt(geneset_file).get_geneset(setname)
    setnames = list(geneset.keys())

    # Read the distance file a single time: the reader returns the node list
    # at index 0 and the matrix at index 1.
    sp_data = cmd.read_distance_matrix(distance_matrix_filename,
                                       in_memory=in_memory)
    diz = {"nodes": sp_data[0], "matrix": sp_data[1]}

    # NOTE(review): presumably only one triangle of the matrix is stored, so
    # adding the transpose yields the full symmetric matrix - confirm.
    diz["matrix"] = diz["matrix"] + np.transpose(diz["matrix"])
    np.fill_diagonal(diz["matrix"], float(0))
    diz['vector'] = np.sum(diz["matrix"], axis=0)

    # Generate output
    output1 = out.Output(network_file, output_table, "topology_centrality",
                         geneset_file, setnames)
    logging.info("Results file = " + output1.output_table_results)
    # Create table
    output1.create_st_table_empirical()
    st_test = st.StatisticalTest(average_closeness_centrality, network, diz)

    for setname, item in geneset.items():
        # Geneset smaller than size cut are not taken into consideration
        # (the size is checked on the raw entry, before de-duplication).
        if len(item) <= size_cut:
            continue

        item = set(item)
        observed, pvalue, null_d, n_mapped, n_geneset = st_test.empirical_pvalue(
            item,
            max_iter=number_of_permutations,
            alternative="greater",
            cores=cores)
        logging.info("Setname:" + setname)

        # Second filter: enough genes must actually be mapped by the test.
        if n_mapped < size_cut:
            logging.info(
                "%s removed from results since nodes mapped are < %d",
                setname, size_cut)
            continue

        logging.info("Observed: %g p-value: %g", observed, pvalue)
        output1.update_st_table_empirical(setname, n_mapped, n_geneset,
                                          number_of_permutations,
                                          observed, pvalue,
                                          np.mean(null_d),
                                          np.var(null_d))

    output1.close_temporary_table()

    logging.info("Test topology CENTRALITY completed")