예제 #1
0
    def __init__(self, dir, params):
        """
        Constructor.
        Args:
            dir (string) directory of the experiment to be run
            params (dict) configuration; must provide "associations_path",
                "disease_subset", "ppi_network", "model_path" and
                "drug_targets_path"
        """
        super().__init__(dir, params)

        # set the logger
        set_logger(os.path.join(self.dir, 'experiment.log'),
                   level=logging.INFO, console=True)

        logging.info("Loading disease associations...")
        self.diseases_dict = load_diseases(self.params["associations_path"],
                                           self.params["disease_subset"],
                                           exclude_splits=['none'])

        logging.info("Loading network...")
        self.network = Network(self.params["ppi_network"])
        # node degrees, in network node order
        self.degrees = np.array(list(dict(self.network.nx.degree()).values()))

        logging.info("Loading weights...")
        with open(os.path.join(params["model_path"], "models", "models.tar"), "rb") as f:
            split_to_model = pickle.load(f)

        # average the per-split CI weights into one weight per node, then
        # normalize by sqrt(degree) to correct for degree bias
        # (dropped the redundant, unused `ci_weights = ` chained local)
        self.ci_weights = np.mean([model['ci_weight'][0, 0].numpy()
                                   for model in split_to_model.values()], axis=0)
        self.ci_weights_norm = self.ci_weights / np.sqrt(self.degrees)

        logging.info("Loading drugs...")
        # maps drug id -> collection of target protein ids
        self.drug_to_targets = load_drug_targets(params["drug_targets_path"])
예제 #2
0
    def __init__(self, dir, params):
        """
        Initialize the disease protein prediction experiment.

        Args:
            dir (string) directory of the experiment to be run
            params (dict) experiment configuration
        """
        super().__init__(dir, params)

        # route log records to <dir>/experiment.log and the console
        log_path = os.path.join(self.dir, 'experiment.log')
        set_logger(log_path, level=logging.INFO, console=True)

        # banner
        for banner in ("Disease Protein Prediction",
                       "Sabri Eyuboglu  -- SNAP Group",
                       "======================================"):
            logging.info(banner)

        logging.info("Loading Disease Associations...")
        self.diseases_dict = load_diseases(
            self.params["associations_path"],
            self.params["disease_subset"],
            exclude_splits=['none'],
        )

        logging.info("Loading Network...")
        self.network = Network(self.params["ppi_network"])

        # resolve the configured method class by name and instantiate it
        self.params["method_params"]["dir"] = dir
        method_cls = globals()[self.params["method_class"]]
        self.method = method_cls(self.network,
                                 self.diseases_dict,
                                 self.params["method_params"])
예제 #3
0
    def __init__(self, dir, params):
        """
        Constructor.
        Args:
            dir (string) directory of the experiment to be run
            params (dict) must provide "associations_path", "disease_subset",
                "ppi_network" and "model_path"
        """
        super().__init__(dir, params)

        set_logger(os.path.join(self.dir, 'experiment.log'),
                   level=logging.INFO, console=True)

        logging.info("Loading disease associations...")
        self.diseases_dict = load_diseases(self.params["associations_path"],
                                           self.params["disease_subset"],
                                           exclude_splits=['none'])

        logging.info("Loading network...")
        self.network = Network(self.params["ppi_network"])
        # node degrees, in network node order
        self.degrees = np.array(list(dict(self.network.nx.degree()).values()))

        logging.info("Loading weights...")
        with open(os.path.join(params["model_path"], "models", "models.tar"), "rb") as f:
            split_to_model = pickle.load(f)

        # average per-split CI weights into one weight per node, then
        # normalize by sqrt(degree) to correct for degree bias
        # (dropped the redundant, unused `ci_weights = ` chained local)
        self.ci_weights = np.mean([model['ci_weight'][0, 0].numpy()
                                   for model in split_to_model.values()], axis=0)
        self.ci_weights_norm = self.ci_weights / np.sqrt(self.degrees)

        logging.info("Loading enrichment study...")
        # NOTE(review): GO paths are hard-coded here, unlike sibling
        # experiments that read them from params — consider parameterizing.
        # taxid 9606 restricts gene-to-GO annotations to human.
        geneid2go = read_ncbi_gene2go("data/go/gene2go.txt", taxids=[9606])
        obodag = GODag("data/go/go-basic.obo")
        self.go_study = GOEnrichmentStudy(self.network.get_names(),
                                          geneid2go,
                                          obodag,
                                          propagate_counts=True,
                                          alpha=0.05,
                                          methods=['fdr_bh'])
예제 #4
0
    def __init__(self, dir, params):
        """ Initialize the disease protein prediction experiment.
        Args:
            dir (string) The directory where the experiment should be run
            params (dict) experiment configuration
        """
        super().__init__(dir, params)

        # send log records to <dir>/experiment.log and the console
        set_logger(os.path.join(dir, 'experiment.log'),
                   level=logging.INFO,
                   console=True)

        # banner
        for banner in ("Node set expansion evaluation",
                       "Sabri Eyuboglu, Marinka Zitnik and Jure Leskovec  -- SNAP Group",
                       "======================================"):
            logging.info(banner)

        # load the network, optionally perturbed by removing nodes/edges
        logging.info("Loading Network...")
        remove_nodes = self.params.get("remove_nodes", 0)
        remove_edges = self.params.get("remove_edges", 0)
        self.network = Network(self.params["ppi_network"],
                               remove_nodes=remove_nodes,
                               remove_edges=remove_edges)

        logging.info("Loading Associations...")
        self.diseases_dict = load_diseases(
            self.params["associations_path"],
            self.params["disease_subset"],
            exclude_splits=['none'],
        )

        # resolve the configured method class by name and instantiate it
        self.params["method_params"]["dir"] = dir
        method_cls = globals()[self.params["method_class"]]
        self.method = method_cls(self.network, self.diseases_dict,
                                 self.params["method_params"])
예제 #5
0
    def _run(self):
        """
        Run the experiment.

        Loads the network and its precomputed PPI matrices, buckets nodes
        by degree, then scores every disease (in parallel when
        params["n_processes"] > 1) and collects the per-disease results
        into a DataFrame in ``self.results``.
        """
        logging.info("Loading Network...")
        self.network = Network(self.params["ppi_network"])

        logging.info("Loading PPI Matrices...")
        self.ppi_matrices = load_network_matrices(self.params["ppi_matrices"],
                                                  self.network)

        logging.info("Building Degree Buckets...")
        self.degree_to_bucket = build_degree_buckets(
            self.network, min_len=self.params["min_bucket_len"])

        logging.info("Running Experiment...")
        self.results = []

        if self.params["n_processes"] > 1:
            # context manager guarantees the worker pool is torn down even
            # if a worker raises (the original never closed the pool)
            with Pool(self.params["n_processes"]) as p, \
                    tqdm(total=len(self.diseases)) as t:
                for results in p.imap(process_disease_wrapper,
                                      self.diseases.values()):
                    self.results.append(results)
                    t.update()
        else:
            with tqdm(total=len(self.diseases)) as t:
                for disease in self.diseases.values():
                    results = self.process_disease(disease)
                    self.results.append(results)
                    t.update()
        self.results = pd.DataFrame(self.results)
예제 #6
0
    def __init__(self, dir, params):
        """
        Constructor 
        Args: 
            dir (string) directory of the experiment to be run
            params (dict) must provide "associations_path", "disease_subset",
                "ppi_network", "go_path", "gene_to_go_path",
                "enrichment_params" and "method_to_preds"
        """
        super().__init__(dir, params)

        # Set the logger
        set_logger(os.path.join(self.dir, 'experiment.log'), 
                   level=logging.INFO, console=True)

        # Log title 
        logging.info("Disease Protein Prediction")
        logging.info("Sabri Eyuboglu  -- SNAP Group")
        logging.info("======================================")
        
        logging.info("Loading Disease Associations...")
        self.diseases_dict = load_diseases(self.params["associations_path"], 
                                           self.params["disease_subset"],
                                           exclude_splits=['none'])
        
        logging.info("Loading Network...")
        self.network = Network(self.params["ppi_network"]) 
        
        logging.info("Loading enrichment study...")
        # GO enrichment study over all proteins in the network;
        # taxid 9606 restricts gene-to-GO annotations to human
        obodag = GODag(self.params["go_path"])
        geneid2go = read_ncbi_gene2go(self.params["gene_to_go_path"], taxids=[9606])
        self.enrichment_study = GOEnrichmentStudy(self.network.get_names(),
                                                  geneid2go,
                                                  obodag,
                                                  log=None,
                                                  **self.params["enrichment_params"])

        logging.info("Loading predictions...")
        # one DataFrame of predictions per method, indexed by disease id
        self.method_to_preds = {name: pd.read_csv(os.path.join(preds, "predictions.csv"), 
                                                  index_col=0) 
                                for name, preds in self.params["method_to_preds"].items()}
        
        # resume from previously computed outputs when a pickle exists
        outputs_path = os.path.join(self.dir, "outputs.pkl")
        if os.path.exists(outputs_path):
            logging.info("Loading outputs...")
            with open(outputs_path, 'rb') as f:
                self.outputs = pickle.load(f)
        else:
            self.outputs = {}
예제 #7
0
 def __init__(self, dir, params):
     """
     Initialize the network-matrix builder experiment.

     Args:
         dir (string) directory of the experiment to be run
         params (dict) must provide "ppi_network", "deg_fn", "col_norm",
             "row_norm" and "self_loops"
     """
     super().__init__(dir, params)

     # banner
     for banner in ("Network Matrix Builder",
                    "Sabri Eyuboglu  -- SNAP Group",
                    "======================================"):
         logging.info(banner)

     logging.info("Loading PPI Network...")
     self.network = Network(params["ppi_network"])

     # normalization / preprocessing options for the matrix build
     self.deg_fn = params["deg_fn"]
     self.col_norm = params["col_norm"]
     self.row_norm = params["row_norm"]
     self.self_loops = params["self_loops"]
예제 #8
0
    def __init__(self, dir, params):
        """
        Constructor 
        Args: 
            dir (string) directory of the experiment to be run
            params (dict) must provide "associations_path", "disease_subset",
                "ppi_network", "method_to_preds" and "field_to_protein_data"
        """
        super().__init__(dir, params)

        # Set the logger
        set_logger(os.path.join(self.dir, 'experiment.log'),
                   level=logging.INFO,
                   console=True)

        # Log title
        logging.info("Metric Significance of Diseases in the PPI Network")
        logging.info("Sabri Eyuboglu  -- SNAP Group")
        logging.info("======================================")
        logging.info("Loading Disease Associations...")
        self.diseases = load_diseases(self.params["associations_path"],
                                      self.params["disease_subset"],
                                      exclude_splits=['none'])

        logging.info("Loading Network...")
        self.network = Network(self.params["ppi_network"])

        logging.info("Loading Predictions...")
        # one DataFrame of predictions per method, indexed by disease id
        self.method_to_preds = {
            name: pd.read_csv(os.path.join(preds, "predictions.csv"),
                              index_col=0)
            for name, preds in self.params["method_to_preds"].items()
        }

        logging.info("Loading Protein Data...")
        # per-field protein annotation mappings, loaded from configured paths
        self.field_to_protein_data = {
            field: load_mapping(path=config["path"], **config["args"])
            for field, config in self.params["field_to_protein_data"].items()
        }
예제 #9
0
    def _run(self):
        """
        Run the experiment.

        Paired t-test asking whether protein pairs with high mutual
        information share molecule associations (Jaccard similarity) more
        than directly interacting (adjacent) pairs. Writes a results.json
        with the test statistic, p-value, and group means.
        """
        logging.info("Loading network...")
        network = Network(self.params["ppi_network"])

        logging.info("Loading molecule associations...")
        associations = {}
        for association_path in self.params["association_paths"]:
            associations.update(load_diseases(association_path))

        association_matrix, _ = build_disease_matrix(associations, network)

        # pairwise Jaccard similarity between proteins' association profiles
        association_jaccard = compute_jaccard(association_matrix.T)

        # (dropped the redundant `mi_matrix = mi_matrix =` double assignment)
        mi_matrix = load_network_matrices(
            {"mi": self.params["mi_dir"]}, network=network)["mi"]

        # flatten the strict upper triangle so each unordered pair appears
        # once; all three matrices are protein x protein over the same network
        triu = np.triu_indices(mi_matrix.shape[0], k=1)
        mi_values = mi_matrix[triu]
        adj_values = network.adj_matrix[triu]
        jaccard_values = association_jaccard[triu]

        # compare the k highest-MI pairs against the k adjacent pairs,
        # where k = number of edges (argpartition computed once per side,
        # not twice as before)
        k = adj_values.sum().astype(int)
        top_mi = np.argpartition(mi_values, -k)[-k:]
        top_adj = np.argpartition(adj_values, -k)[-k:]
        statistic, pvalue = ttest_rel(jaccard_values[top_mi],
                                      jaccard_values[top_adj])

        metrics = {
            "test": "ttest_rel",
            # cast to built-in float so json.dump never chokes on numpy scalars
            "statistic": float(statistic),
            "pvalue": float(pvalue),
            "mi_mean": float(jaccard_values[top_mi].mean()),
            "adj_mean": float(jaccard_values[top_adj].mean()),
        }

        with open(os.path.join(self.dir, "results.json"), "w") as f:
            json.dump(metrics, f, indent=4)
예제 #10
0
    def _run(self):
        """
        Run the experiment.

        Loads the network and its precomputed PPI matrices, buckets nodes
        by degree, then processes every disease (in parallel when
        params["n_processes"] > 1), assembling per-(disease, protein) rows
        into a MultiIndex DataFrame in ``self.results``.
        """
        logging.info("Loading Network...")
        self.network = Network(self.params["ppi_network"])

        logging.info("Loading PPI Matrices...")
        self.ppi_matrices = load_network_matrices(
            self.params["ppi_matrices"], self.network
        )

        logging.info("Building Degree Buckets...")
        self.degree_to_bucket = build_degree_buckets(
            self.network, min_len=self.params["min_bucket_len"]
        )

        logging.info("Running Experiment...")
        self.results = []
        self.indices = []

        if self.params["n_processes"] > 1:
            # context manager guarantees the worker pool is torn down even
            # if a worker raises (the original never closed the pool)
            with Pool(self.params["n_processes"]) as p, \
                    tqdm(total=len(self.diseases)) as t:
                for indices, results in p.imap(
                    process_disease_wrapper, self.diseases.values()
                ):
                    # skip diseases for which processing produced no rows
                    if indices is None:
                        continue
                    self.indices.extend(indices)
                    self.results.extend(results)
                    t.update()
        else:
            with tqdm(total=len(self.diseases)) as t:
                for disease in self.diseases.values():
                    indices, results = self.process_disease(disease)
                    if indices is None:
                        continue
                    self.indices.extend(indices)
                    self.results.extend(results)
                    t.update()

        index = pd.MultiIndex.from_tuples(self.indices, names=["disease", "protein"])
        self.results = pd.DataFrame(self.results, index=index)
예제 #11
0
    def __init__(self, dir, params):
        """
        Initialize the experiment.

        Args:
            dir (string) directory of the experiment to be run
            params (dict) experiment configuration
        """
        super().__init__(dir, params)

        # write logs both to experiment.log and to the console
        set_logger(os.path.join(self.dir, 'experiment.log'),
                   level=logging.INFO, console=True)

        logging.info("Loading disease associations...")
        self.diseases_dict = load_diseases(
            self.params["associations_path"],
            self.params["disease_subset"],
            exclude_splits=['none'],
        )

        logging.info("Loading network...")
        self.network = Network(self.params["ppi_network"])
        # node degrees, keyed by node, flattened into an array
        degree_map = dict(self.network.nx.degree())
        self.degrees = np.array(list(degree_map.values()))
예제 #12
0
class DrugTarget(Experiment):
    """
    Experiment relating learned CI weights to drug-target status.

    Loads disease associations, the PPI network, per-split model weights,
    and a drug-to-target mapping, then compares degree-normalized weights
    between proteins that are and are not drug targets.
    """
    def __init__(self, dir, params):
        """
        Constructor 
        Args: 
            dir (string) directory of the experiment to be run
            params (dict) must provide "associations_path", "disease_subset",
                "ppi_network", "model_path", "drug_targets_path" and "top_k"
        """
        super().__init__(dir, params)

        # set the logger
        set_logger(os.path.join(self.dir, 'experiment.log'), 
                   level=logging.INFO, console=True)

        logging.info("Loading disease associations...")
        self.diseases_dict = load_diseases(self.params["associations_path"], 
                                           self.params["disease_subset"],
                                           exclude_splits=['none'])
        
        logging.info("Loading network...")
        self.network = Network(self.params["ppi_network"]) 
        # node degrees, in network node order
        self.degrees = np.array(list(dict(self.network.nx.degree()).values()))
        
        logging.info("Loading weights...")
        with open(os.path.join(params["model_path"], "models", "models.tar"), "rb") as f:
            split_to_model = pickle.load(f)
            
        # average per-split CI weights into one weight per node, then
        # normalize by sqrt(degree) to correct for degree bias
        self.ci_weights = ci_weights = np.mean([model['ci_weight'][0, 0].numpy() 
                                                for model in split_to_model.values()], axis=0)
        self.ci_weights_norm = self.ci_weights / np.sqrt(self.degrees)

                
        logging.info("Loading drugs...")
        # maps drug id -> collection of target protein ids
        self.drug_to_targets = load_drug_targets(params["drug_targets_path"])
        
    
    def compute_drug_counts(self):
        """
        Get np.array num_drugs where num_drugs[u] gives the count of drugs that target node u. 
        Stores the result in self.drug_counts.
        """
        protein_to_drug_count = Counter()
        for drug, targets in self.drug_to_targets.items():
            for target in targets:
                protein_to_drug_count[target] += 1
        # keep only proteins present in the network, keyed by node index
        node_to_drug_count = {self.network.get_node(protein): count 
                              for protein, count 
                              in protein_to_drug_count.items() 
                              if protein in self.network}
        num_drugs = np.zeros(len(self.network))
        # relies on dict preserving insertion order so keys/values stay aligned
        num_drugs[list(node_to_drug_count)] = list(node_to_drug_count.values())
        
        self.drug_counts = np.array(num_drugs)
    
    def compute_weight_stats(self, proteins=None):
        """
        Summary statistics (mean/median/std) of the degree-normalized CI
        weights over the given node indices (all nodes when None).
        """
        if proteins is None:
            proteins = np.arange(len(self.network))
        return {
            "mean": np.mean(self.ci_weights_norm[proteins]),
            "median": np.median(self.ci_weights_norm[proteins]),
            "std": np.std(self.ci_weights_norm[proteins])
        }
    
    def compute_frac_targets(self, proteins=None):
        """
        Fraction of the given nodes (all nodes when None) targeted by at
        least one drug. Requires compute_drug_counts() to have been run.
        """
        if proteins is None:
            proteins = np.arange(len(self.network))
        
        return np.mean((self.drug_counts > 0)[proteins])

    def frac_targets_ks_test(self, proteins_a, proteins_b):
        # two-sample KS test on the boolean is-a-target indicators
        targets_a = (self.drug_counts > 0)[proteins_a]
        targets_b = (self.drug_counts > 0)[proteins_b]
        return ks_2samp(targets_a, targets_b)
    
    
    def _run(self):
        """
        Run the experiment.

        Compares normalized weight statistics between drug targets and
        non-targets, and drug-target fractions between top- and
        remaining-weight proteins; writes results.json.
        """     
        results = {"norm_weight": {},
                   "frac_targets": {}}
        self.compute_drug_counts()
        
        target_proteins = np.where(self.drug_counts != 0)
        not_target_proteins = np.where(self.drug_counts == 0)
        
        results["norm_weight"]["all"] = self.compute_weight_stats()
        results["norm_weight"]["target"] = self.compute_weight_stats(target_proteins)
        results["norm_weight"]["not_target"] = self.compute_weight_stats(not_target_proteins)
        
        top_proteins = np.argsort(self.ci_weights_norm)[-self.params["top_k"]:]
        # NOTE(review): despite the name, this slice is every protein EXCEPT
        # the top k, not the bottom k — confirm this is intended
        bottom_proteins = np.argsort(self.ci_weights_norm)[:-self.params["top_k"]]
        
        results["frac_targets"]["top"] = self.compute_frac_targets(top_proteins)
        results["frac_targets"]["bottom"] = self.compute_frac_targets(bottom_proteins)
        
        results["frac_targets"]["pvalue"] = self.frac_targets_ks_test(top_proteins, bottom_proteins).pvalue
        
        with open(os.path.join(self.dir, "results.json"), 'w') as f: 
            json.dump(results, f, indent=4)
        
    
    def plot_drug_weight_dist(self, protein_sets, save="weight_dist.pdf"):
        """
        Overlay log-scale histograms of degree-normalized weights for each
        named protein set; weights are clipped above at 1.0 for plotting.
        """
        weights = np.minimum(1.0, self.ci_weights_norm)
        
        prepare_sns(sns, kwargs={"font_scale": 1.4,
                         "rc": {'figure.figsize':(6, 4)}})
        for name, proteins in protein_sets.items():
            sns.distplot(weights[proteins], 
                 kde=False, hist=True, norm_hist=True, bins=25, 
                 hist_kws={"range":(-0.25, 1.1),
                           "alpha": 0.8},
                         label=name)

        sns.despine()
        plt.xscale('linear')
        plt.yscale('log')
        plt.legend()
        plt.xlabel(r"Degree-normalized LCI weight, $\frac{w_z}{\sqrt{d_z}}$")
        plt.ylabel("Density")
        plt.tight_layout()
        
        if save is not None:
            plt.savefig(os.path.join(self.dir, "_figures", save))
    
    def plot_frac_drug_weight(self, protein_set, save="frac_weight.pdf"):
        """
        Bar plot of the fraction of all proteins in each weight-histogram
        bin that belong to protein_set.

        NOTE(review): the `save` parameter is never used in this body —
        the figure is not written to disk; confirm whether saving was
        intended.
        """
        num_bins = 22
        weights = np.minimum(1.1, self.ci_weights_norm)
        drug_hist, drug_bins = np.histogram(weights[protein_set],
                                            range=(-0.3, 1.5), bins=num_bins)
        all_hist, all_bins = np.histogram(weights, 
                                          range=(-0.3, 1.5), bins=num_bins)
        #plt.bar(drug_bins[:-1], drug_hist)

        assert(np.all(drug_bins == all_bins))

        # per-bin fraction of proteins that are in protein_set
        frac_hist = drug_hist / all_hist
        plt.bar(x=drug_bins[:-1], height=frac_hist, width=(drug_bins[1] - drug_bins[0]))
예제 #13
0
class EssentialGeneAnalysis(Experiment):
    """
    Experiment comparing learned CI weights between essential and
    non-essential genes in the PPI network.
    """
    def __init__(self, dir, params):
        """
        Constructor 
        Args: 
            dir (string) directory of the experiment to be run
            params (dict) must provide "associations_path", "disease_subset",
                "ppi_network", "model_path" and "essential_genes_path"
        """
        super().__init__(dir, params)

        # set the logger
        set_logger(os.path.join(self.dir, 'experiment.log'), 
                   level=logging.INFO, console=True)

        logging.info("Loading disease associations...")
        self.diseases_dict = load_diseases(self.params["associations_path"], 
                                           self.params["disease_subset"],
                                           exclude_splits=['none'])
        
        logging.info("Loading network...")
        self.network = Network(self.params["ppi_network"]) 
        # node degrees, in network node order
        self.degrees = np.array(list(dict(self.network.nx.degree()).values()))
        
        logging.info("Loading weights...")
        with open(os.path.join(params["model_path"], "models", "models.tar"), "rb") as f:
            split_to_model = pickle.load(f)
            
        # average per-split CI weights into one weight per node, then
        # normalize by sqrt(degree) to correct for degree bias
        self.ci_weights = ci_weights = np.mean([model['ci_weight'][0, 0].numpy() 
                                                for model in split_to_model.values()], axis=0)
        self.ci_weights_norm = self.ci_weights / np.sqrt(self.degrees)

                
        logging.info("Loading essential genes...")
        self.essential_proteins = load_essential_proteins(params["essential_genes_path"])
        self.essential_nodes = self.network.get_nodes(self.essential_proteins)
        # NOTE(review): membership test against essential_nodes is linear
        # per node; a set would be faster — left unchanged here
        self.non_essential_nodes = [node for node in self.network.get_nodes()
                                    if node not in self.essential_nodes]
        
        # indicator vector: 1 where the node's gene is essential
        self.essential_array = np.zeros(len(self.network))
        self.essential_array[self.essential_nodes] = 1

        
    def compute_weight_stats(self, nodes=None, norm=True):
        """
        Summary statistics (mean/median/std) of the CI weights over the
        given nodes (all nodes when None); uses degree-normalized weights
        when norm=True, raw weights otherwise.
        """
        weights = self.ci_weights_norm if norm else self.ci_weights
        if nodes is None:
            nodes = np.arange(len(self.network))
        return {
            "mean": np.mean(weights[nodes]),
            "median": np.median(weights[nodes]),
            "std": np.std(weights[nodes])
        }

    def compute_frac_essential(self, nodes): 
        """
        Fraction of the given nodes whose gene is essential.
        """
        return np.mean(self.essential_array[nodes])
         
        
    def plot_weight_dist(self, node_sets):
        """
        Overlay normalized histograms of raw CI weights for each named set
        of nodes.
        """
        for name, nodes in node_sets.items():
            sns.distplot(self.ci_weights[nodes], 
                 kde=False, hist=True, norm_hist=True, bins=15, 
                 hist_kws={"range":(-0.4, 0.8)}, label=name)

        plt.xscale('linear')
        plt.yscale('linear')
        plt.legend()
        plt.xlabel(r"$\frac{w_k}{\sqrt{d_k}}$")
        plt.ylabel("# of proteins [normalized]")
        
예제 #14
0
class FunctionalEnrichmentAnalysis(Experiment):
    """
    Experiment running a GO-term enrichment study over the proteins with
    the highest degree-normalized CI weights.
    """
    
    def __init__(self, dir, params):
        """
        Constructor.
        Args:
            dir (string) directory of the experiment to be run
            params (dict) must provide "associations_path", "disease_subset",
                "ppi_network", "model_path" and "top_k"
        """
        super().__init__(dir, params)
        
        set_logger(os.path.join(self.dir, 'experiment.log'), 
                   level=logging.INFO, console=True)

        logging.info("Loading disease associations...")
        self.diseases_dict = load_diseases(self.params["associations_path"], 
                                           self.params["disease_subset"],
                                           exclude_splits=['none'])
        
        logging.info("Loading network...")
        self.network = Network(self.params["ppi_network"]) 
        # node degrees, in network node order
        self.degrees = np.array(list(dict(self.network.nx.degree()).values()))
        
        logging.info("Loading weights...")
        with open(os.path.join(params["model_path"], "models", "models.tar"), "rb") as f:
            split_to_model = pickle.load(f)
            
        # average per-split CI weights into one weight per node, then
        # normalize by sqrt(degree) to correct for degree bias
        self.ci_weights = ci_weights = np.mean([model['ci_weight'][0, 0].numpy() 
                                                for model in split_to_model.values()], axis=0)
        self.ci_weights_norm = self.ci_weights / np.sqrt(self.degrees)
        
        logging.info("Loading enrichment study...")
        # NOTE(review): GO paths are hard-coded here; taxid 9606 restricts
        # gene-to-GO annotations to human
        geneid2go = read_ncbi_gene2go("data/go/gene2go.txt", taxids=[9606])
        obodag = GODag("data/go/go-basic.obo")
        self.go_study = GOEnrichmentStudy(self.network.get_names(),
                                          geneid2go,
                                          obodag, 
                                          propagate_counts = True,
                                          alpha = 0.05,
                                          methods = ['fdr_bh'])

    
    def run_study(self):
        """
        Run the GO enrichment study on the top_k proteins by
        degree-normalized CI weight; stores records in self.raw_results.
        """
        top_nodes = np.argsort(self.ci_weights_norm)[-self.params["top_k"]:]
        top_proteins = self.network.get_names(top_nodes)
        self.raw_results = self.go_study.run_study(set(top_proteins))  
    
    def to_csv(self):
        """
        Flatten the raw enrichment records to (name, pvalue, goterm_id)
        rows, sort by FDR-corrected p-value, and write them to
        all_terms.csv in the experiment directory.
        """
        self.results = []
        for r in self.raw_results:
            self.results.append({
                "name": r.name,
                "pvalue": r.p_fdr_bh,
                "goterm_id": r.goterm.id
            })
        self.results = sorted(self.results, key = lambda x: x["pvalue"])
        
        results_df = pd.DataFrame(self.results)
        results_df.to_csv(os.path.join(self.dir, "all_terms.csv"))            
예제 #15
0
class DiseaseSubgraph(Experiment):
    """
    Class for running experiment that assess the significance of a network metric
    between disease proteins. Uses the method described in Guney et al. for generating
    random subgraph. 
    """
    def __init__(self, dir, params):
        """
        Constructor 
        Args: 
            dir (string) directory of the experiment to be run
            params (dict) must provide "associations_path", "disease_subset",
                "ppi_network", "method_to_preds", "field_to_protein_data",
                "num_preds" and "n_processes"
        """
        super().__init__(dir, params)

        # Set the logger
        set_logger(os.path.join(self.dir, 'experiment.log'),
                   level=logging.INFO,
                   console=True)

        # Log title
        logging.info("Metric Significance of Diseases in the PPI Network")
        logging.info("Sabri Eyuboglu  -- SNAP Group")
        logging.info("======================================")
        logging.info("Loading Disease Associations...")
        self.diseases = load_diseases(self.params["associations_path"],
                                      self.params["disease_subset"],
                                      exclude_splits=['none'])

        logging.info("Loading Network...")
        self.network = Network(self.params["ppi_network"])

        logging.info("Loading Predictions...")
        # one DataFrame of predictions per method, indexed by disease id
        self.method_to_preds = {
            name: pd.read_csv(os.path.join(preds, "predictions.csv"),
                              index_col=0)
            for name, preds in self.params["method_to_preds"].items()
        }

        logging.info("Loading Protein Data...")
        # per-field protein annotation mappings, loaded from configured paths
        self.field_to_protein_data = {
            field: load_mapping(path=config["path"], **config["args"])
            for field, config in self.params["field_to_protein_data"].items()
        }

    def compute_disease_subgraph(self, disease):
        """ Get the disease subgraph of 
        Args:
            disease: (Disease) A disease object
        Returns:
            (subgraph, node_to_roles) where subgraph is the induced
            subgraph over all collected nodes and node_to_roles maps each
            node to the first role it was assigned ("disease",
            "pred_<method>", "common_pred_<method>", or "common_disease").
        """
        node_to_roles = {}
        disease_nodes = disease.to_node_array(self.network)
        for disease_node in disease_nodes:
            node_to_roles[disease_node] = "disease"

        # precompute neighbor sets of the disease nodes for reuse below
        disease_node_to_nbrs = {
            node: set(self.network.nx.neighbors(node))
            for node in disease_nodes
        }

        for method, preds in self.method_to_preds.items():
            # top num_preds predicted proteins for this disease and method
            top_pred_proteins = set(
                map(
                    int, preds.loc[disease.id].sort_values(
                        ascending=False).index[:self.params["num_preds"]]))
            top_pred_nodes = self.network.get_nodes(top_pred_proteins)

            for pred_node in top_pred_nodes:
                # earlier role assignments (e.g. "disease") take precedence
                if pred_node not in node_to_roles:
                    node_to_roles[pred_node] = f"pred_{method}"
                pred_nbrs = set(self.network.nx.neighbors(pred_node))
                for disease_node in disease_nodes:
                    disease_nbrs = disease_node_to_nbrs[disease_node]
                    common_nbrs = disease_nbrs & pred_nbrs
                    for common_nbr in common_nbrs:
                        if common_nbr not in node_to_roles:
                            node_to_roles[common_nbr] = f"common_pred_{method}"

        # add nodes that are common neighbors of two disease nodes
        for a, node_a in enumerate(disease_nodes):
            for b, node_b in enumerate(disease_nodes):
                # avoid repeat pairs
                if a >= b:
                    continue
                common_nbrs = disease_node_to_nbrs[
                    node_a] & disease_node_to_nbrs[node_b]
                for common_nbr in common_nbrs:
                    if common_nbr not in node_to_roles:
                        node_to_roles[common_nbr] = "common_disease"

        # get induced subgraph
        subgraph = self.network.nx.subgraph(node_to_roles.keys())

        return subgraph, node_to_roles

    def write_subgraph(self, disease, node_to_roles, subgraph, delimiter='\t'):
        """
        Write the subgraph's edge list (node_1, node_2, roles) to
        subgraph_<disease.id>.txt under the disease's output directory.
        """
        directory = os.path.join(self.dir, 'diseases', disease.id)
        if not os.path.exists(directory):
            os.makedirs(directory)

        with open(os.path.join(directory, f"subgraph_{disease.id}.txt"),
                  "w") as f:
            f.write(delimiter.join(["node_1", "node_2", "roles"]) + '\n')
            for edge in subgraph.edges():
                items = [str(edge[0]), str(edge[1])]

                # add interaction type (roles of both endpoints)
                roles = node_to_roles[edge[0]] + "-" + node_to_roles[edge[1]]
                items.append(roles)

                f.write(delimiter.join(items) + '\n')

    def write_protein_data(self, disease, node_to_roles):
        """
        Write one row per subgraph node (id, protein, role, degree, plus
        configured protein-data fields) to data_<disease.id>.csv.
        """
        directory = os.path.join(self.dir, 'diseases', disease.id)
        if not os.path.exists(directory):
            os.makedirs(directory)

        protein_data = []
        for node, roles in node_to_roles.items():
            protein_id = self.network.get_names([node])[0]
            node_dict = {
                "node_id": node,
                "protein_id": protein_id,
                "role": roles,
                "degree": self.network.nx.degree(node)
            }

            # "weight" fields are only recorded for "common" nodes
            for field, data in self.field_to_protein_data.items():
                if not ("weight" in field and "common" not in roles):
                    node_dict[field] = data.get(protein_id, "")
            protein_data.append(node_dict)

        df = pd.DataFrame(protein_data)
        df = df.set_index('node_id')
        df.to_csv(os.path.join(directory, f"data_{disease.id}.csv"))

    def process_disease(self, disease):
        """
        Compute the disease subgraph and write its edge list and per-node
        data to the disease's output directory.
        Args:
            disease (Disease) the current disease 
        """
        subgraph, node_to_roles = self.compute_disease_subgraph(disease)

        disease_directory = os.path.join(self.dir, 'diseases', disease.id)
        if not os.path.exists(disease_directory):
            os.makedirs(disease_directory)

        self.write_subgraph(disease, node_to_roles, subgraph)
        self.write_protein_data(disease, node_to_roles)

    def _run(self):
        """
        Run the experiment.

        Processes every disease (in parallel when n_processes > 1) and
        collects the returned values into self.results.
        """

        logging.info("Running Experiment...")
        self.results = []

        if self.params["n_processes"] > 1:
            with tqdm(total=len(self.diseases)) as t:
                p = Pool(self.params["n_processes"])
                for results in p.imap(process_disease_wrapper,
                                      self.diseases.values()):
                    self.results.append(results)
                    t.update()
        else:
            with tqdm(total=len(self.diseases)) as t:
                for disease in self.diseases.values():
                    results = self.process_disease(disease)
                    self.results.append(results)
                    t.update()
        self.results = pd.DataFrame(self.results)
예제 #16
0
class EvaluateMethod(Experiment):
    """
    Experiment that evaluates a disease protein prediction method via
    k-fold cross validation over a set of diseases.
    """
    def __init__(self, dir, params):
        """ Initialize the disease protein prediction experiment
        Args:
            dir (string) The directory where the experiment should be run
            params (dict) experiment parameters
        """
        super().__init__(dir, params)

        # set the logger
        set_logger(os.path.join(dir, 'experiment.log'),
                   level=logging.INFO,
                   console=True)

        # log Title
        logging.info("Node set expansion evaluation")
        logging.info(
            "Sabri Eyuboglu, Marinka Zitnik and Jure Leskovec  -- SNAP Group")
        logging.info("======================================")

        # load data from params file
        logging.info("Loading Network...")
        self.network = Network(self.params["ppi_network"],
                               remove_nodes=self.params.get("remove_nodes", 0),
                               remove_edges=self.params.get("remove_edges", 0))
        logging.info("Loading Associations...")
        self.diseases_dict = load_diseases(self.params["associations_path"],
                                           self.params["disease_subset"],
                                           exclude_splits=['none'])

        # instantiate the prediction method by class name from params
        self.params["method_params"]["dir"] = dir
        self.method = globals()[self.params["method_class"]](
            self.network, self.diseases_dict, self.params["method_params"])

    def _run(self):
        """ Run the disease protein prediction experiment.

        Evaluates every disease (in parallel when
        params["n_processes"] > 1) and stores per-disease metrics and
        protein ranks in ``self.results``.
        """
        logging.info("Running Experiment...")
        disease_to_metrics, disease_to_ranks = {}, {}
        diseases = list(self.diseases_dict.values())
        diseases.sort(key=lambda x: x.split)
        if self.params["n_processes"] > 1:
            # context manager guarantees the pool is closed and joined
            # (the original leaked the pool)
            with Pool(self.params["n_processes"]) as p:
                with tqdm(total=len(self.diseases_dict)) as t:
                    for disease, metrics, ranks in p.imap(run_dpp_wrapper,
                                                          diseases):
                        # run_dpp returns (disease, None, None) for diseases
                        # that were skipped (fewer than 2 proteins)
                        if metrics is not None or ranks is not None:
                            disease_to_ranks[disease] = ranks
                            disease_to_metrics[disease] = metrics
                            t.set_postfix(
                                str="{} Recall-at-100: {:.2f}%".format(
                                    disease.id,
                                    100 * metrics["Recall-at-100"]))
                        else:
                            t.set_postfix(
                                str="{} Not Recorded".format(disease.id))
                        t.update()

        else:
            with tqdm(total=len(self.diseases_dict)) as t:
                for disease in diseases:
                    disease, metrics, ranks = self.run_dpp(disease)
                    if metrics is not None or ranks is not None:
                        disease_to_metrics[disease] = metrics
                        disease_to_ranks[disease] = ranks
                        t.set_postfix(str="{} Recall-at-100: {:.2f}%".format(
                            disease.id, 100 * metrics["Recall-at-100"]))
                    else:
                        t.set_postfix(str="{} Not Recorded".format(disease.id))
                    t.update()

        self.results = {
            "metrics": disease_to_metrics,
            "ranks": disease_to_ranks
        }

    def compute_node_scores(self, train_nodes, disease):
        """ Score every node in the network for the given disease.
        Args:
            train_nodes: (np.ndarray) node ids used as the positive seed set
            disease: (Disease) A disease object
        Returns:
            scores produced by the configured prediction method
        """
        scores = self.method.compute_scores(train_nodes, disease)
        return scores

    def run_dpp(self, disease):
        """ Perform k-fold cross validation on disease protein prediction
        for one disease.
        Args:
            disease: (Disease) A disease object
        Returns:
            (disease, avg_metrics, proteins_to_ranks) — or
            (disease, None, None) when the disease has fewer than 2 proteins
        """
        disease_nodes = disease.to_node_array(self.network)
        # Ensure that there are at least 2 proteins
        if disease_nodes.size <= 1:
            return disease, None, None
        labels = np.zeros((len(self.network), 1))
        labels[disease_nodes, 0] = 1
        metrics = {}

        # Perform k-fold cross validation; fall back to leave-one-out when
        # n_folds is negative or exceeds the number of disease proteins
        n_folds = (disease_nodes.size if
                   (self.params["n_folds"] < 0
                    or self.params["n_folds"] > len(disease_nodes)) else
                   self.params["n_folds"])
        kf = KFold(n_splits=n_folds, shuffle=False)

        for train_indices, test_indices in kf.split(disease_nodes):
            train_nodes = disease_nodes[train_indices]
            val_nodes = disease_nodes[test_indices]

            # compute node scores
            scores = self.compute_node_scores(train_nodes, disease)

            # compute the metrics of target node (accumulated into `metrics`)
            compute_metrics(metrics, labels, scores, train_nodes, val_nodes)

        avg_metrics = {
            name: np.mean(values)
            for name, values in metrics.items()
        }
        # map each held-out protein name to its rank across folds
        proteins = self.network.get_names(metrics["Nodes"])
        ranks = metrics["Ranks"]
        proteins_to_ranks = dict(zip(proteins, ranks))
        return disease, avg_metrics, proteins_to_ranks

    def save_results(self):
        """ Write metrics and ranks to the experiment directory. """
        write_metrics(self.dir, self.results["metrics"])
        write_ranks(self.dir, self.results["ranks"])
예제 #17
0
class GOEnrichment(Experiment):
    """
    Class for running experiment that conducts enrichment of gene ontology
    terms in pathways in the PPI network, comparing term enrichment of true
    disease proteins against each method's predicted proteins.
    """
    def __init__(self, dir, params):
        """
        Constructor
        Args:
            dir (string) directory of the experiment to be run
            params (dict) experiment parameters
        """
        super().__init__(dir, params)

        # Set the logger
        set_logger(os.path.join(self.dir, 'experiment.log'),
                   level=logging.INFO, console=True)

        # Log title
        logging.info("Disease Protein Prediction")
        logging.info("Sabri Eyuboglu  -- SNAP Group")
        logging.info("======================================")

        logging.info("Loading Disease Associations...")
        self.diseases_dict = load_diseases(self.params["associations_path"],
                                           self.params["disease_subset"],
                                           exclude_splits=['none'])

        logging.info("Loading Network...")
        self.network = Network(self.params["ppi_network"])

        logging.info("Loading enrichment study...")
        obodag = GODag(self.params["go_path"])
        # taxid 9606 restricts GO annotations to human genes
        geneid2go = read_ncbi_gene2go(self.params["gene_to_go_path"],
                                      taxids=[9606])
        self.enrichment_study = GOEnrichmentStudy(self.network.get_names(),
                                                  geneid2go,
                                                  obodag,
                                                  log=None,
                                                  **self.params["enrichment_params"])

        logging.info("Loading predictions...")
        self.method_to_preds = {name: pd.read_csv(os.path.join(preds, "predictions.csv"),
                                                  index_col=0)
                                for name, preds in self.params["method_to_preds"].items()}

        # reuse previously computed enrichment outputs when a cache exists
        outputs_path = os.path.join(self.dir, "outputs.pkl")
        if os.path.exists(outputs_path):
            logging.info("Loading outputs...")
            with open(outputs_path, 'rb') as f:
                self.outputs = pickle.load(f)
        else:
            self.outputs = {}

    def run_study(self, proteins):
        """
        Run the GO enrichment study on a set of proteins.
        Args:
            proteins (iterable) protein identifiers to test for enrichment
        Returns:
            dict mapping GO term name to its BH-FDR corrected p-value
        """
        results = self.enrichment_study.run_study(proteins)
        term_to_pval = {r.goterm.name: r.p_fdr_bh for r in results}

        return term_to_pval

    def compute_spearman_correlation(self, a_term_to_pval, b_term_to_pval):
        """
        Compute the Spearman rank correlation between two term -> p-value
        maps, aligned on the keys of the first map.
        Returns:
            (correlation, p-value) as returned by scipy.stats.spearmanr
        """
        terms = list(a_term_to_pval.keys())
        sp_corr, sp_pval = spearmanr([a_term_to_pval[term] for term in terms],
                                     [b_term_to_pval[term] for term in terms])
        return sp_corr, sp_pval

    def process_disease(self, disease):
        """
        Run enrichment for one disease's true proteins and for each method's
        top predictions, then compare the two term p-value distributions
        (significant-set Jaccard similarity and Spearman correlation).
        Args:
            disease (Disease) the current disease
        Returns:
            (disease, results dict, raw term->pval outputs dict)
        """
        output = {}
        # compute method scores for disease
        disease_proteins = set(self.diseases_dict[disease.id].proteins)

        if disease.id in self.outputs:
            # cached from a previous run
            disease_term_to_pval = self.outputs[disease.id]["disease"]
        else:
            disease_term_to_pval = self.run_study(disease_proteins)
        output["disease"] = disease_term_to_pval

        # significant terms at p < 0.05 and the top_k most significant
        disease_terms = {term for term, pval
                         in disease_term_to_pval.items() if pval < 0.05}
        top_disease_terms = {term for term, _
                             in sorted(disease_term_to_pval.items(),
                                       key=lambda x: x[1])[:self.params["top_k"]]}

        results = {"disease_name": disease.name,
                   "disease_num_significant": len(disease_terms),
                   "disease_top_{}".format(self.params['top_k']): top_disease_terms}

        # number of predictions to be made; -1 means "as many as there are
        # true disease proteins"
        num_preds = (len(disease_proteins)
                     if self.params["num_preds"] == -1
                     else self.params["num_preds"])

        for name, preds in self.method_to_preds.items():
            pred_proteins = set(map(int, preds.loc[disease.id]
                                              .sort_values(ascending=False)
                                              .index[:num_preds]))

            if disease.id in self.outputs:
                pred_term_to_pval = self.outputs[disease.id][name]
            else:
                pred_term_to_pval = self.run_study(pred_proteins)
            output[name] = pred_term_to_pval

            pred_terms = {term for term, pval
                          in pred_term_to_pval.items() if pval < 0.05}
            top_pred_terms = {term for term, _
                              in sorted(pred_term_to_pval.items(),
                                        key=lambda x: x[1])[:self.params["top_k"]]}

            # Jaccard similarity of significant term sets; 0 when both empty
            term_union = disease_terms | pred_terms
            jaccard = (len(disease_terms & pred_terms) / len(term_union)
                       if len(term_union) != 0 else 0)
            sp_corr, sp_pval = self.compute_spearman_correlation(disease_term_to_pval,
                                                                 pred_term_to_pval)

            results[f"{name}_num_significant"] = len(pred_terms)
            results[f"{name}_top_{self.params['top_k']}"] = top_pred_terms
            results[f"{name}_jaccard_sim"] = jaccard
            results[f"{name}_sp_corr"] = sp_corr
            results[f"{name}_sp_pval"] = sp_pval

        return disease, results, output

    def _run(self):
        """
        Run the experiment over all diseases, optionally in parallel,
        collecting per-disease results into ``self.results`` and raw
        enrichment outputs into ``self.outputs``.
        """
        results = []
        indices = []
        outputs = {}

        diseases = list(self.diseases_dict.values())
        diseases.sort(key=lambda x: x.split)
        if self.params["n_processes"] > 1:
            with tqdm(total=len(diseases)) as t:
                # context manager guarantees the pool is closed and joined
                # (the original leaked the pool)
                with Pool(self.params["n_processes"]) as p:
                    for disease, result, output in p.imap(process_disease_wrapper,
                                                          diseases):
                        results.append(result)
                        indices.append(disease.id)
                        outputs[disease.id] = output
                        t.update()
        else:
            with tqdm(total=len(diseases)) as t:
                for disease in diseases:
                    disease, result, output = self.process_disease(disease)
                    results.append(result)
                    indices.append(disease.id)
                    outputs[disease.id] = output

                    t.update()

        self.outputs = outputs
        self.results = pd.DataFrame(results, index=indices)

    def save_results(self, summary=True):
        """
        Saves the results to a csv using a pandas DataFrame.
        """
        print("Saving Results...")
        self.results.to_csv(os.path.join(self.dir, 'results.csv'))

        #if self.params["save_enrichment_results"]:
        #    with open(os.path.join(self.dir,'outputs.pkl'), 'wb') as f:
        #        pickle.dump(self.outputs, f)

    def load_results(self):
        """
        Loads the results from a csv to a pandas DataFrame.
        """
        print("Loading Results...")
        self.results = pd.read_csv(os.path.join(self.dir, 'results.csv'))
예제 #18
0
class DPPPredict(Experiment):
    """
    Class for running an experiment that computes disease protein prediction
    scores for every node in the PPI network, for each disease, and writes
    them to predictions.csv.
    """
    def __init__(self, dir, params):
        """
        Constructor
        Args:
            dir (string) directory of the experiment to be run
            params (dict) experiment parameters
        """
        super().__init__(dir, params)

        # Set the logger
        set_logger(os.path.join(self.dir, 'experiment.log'),
                   level=logging.INFO, console=True)

        # Log title
        logging.info("Disease Protein Prediction")
        logging.info("Sabri Eyuboglu  -- SNAP Group")
        logging.info("======================================")

        logging.info("Loading Disease Associations...")
        self.diseases_dict = load_diseases(self.params["associations_path"],
                                           self.params["disease_subset"],
                                           exclude_splits=['none'])

        logging.info("Loading Network...")
        self.network = Network(self.params["ppi_network"])

        # instantiate the prediction method by class name from params
        self.params["method_params"]["dir"] = dir
        self.method = globals()[self.params["method_class"]](self.network,
                                                             self.diseases_dict,
                                                             self.params["method_params"])

    def process_disease(self, disease):
        """
        Score every network node for one disease.
        Args:
            disease (Disease) the current disease
        Returns:
            (disease, dict mapping protein name -> prediction score)
        """
        # compute method scores for disease
        disease_nodes = disease.to_node_array(self.network)
        scores = self.method.compute_scores(disease_nodes, disease)

        # zero out scores for disease_nodes so known associations are not
        # re-predicted
        scores[disease_nodes] = 0

        results = {self.network.get_names([node])[0]: score
                   for node, score in enumerate(scores)}

        return disease, results

    def _run(self):
        """
        Run the experiment over all diseases, optionally in parallel,
        collecting per-disease score dicts into ``self.results``
        (one row per disease, indexed by disease id).
        """
        results = []
        indices = []

        diseases = list(self.diseases_dict.values())
        diseases.sort(key=lambda x: x.split)
        if self.params["n_processes"] > 1:
            with tqdm(total=len(diseases)) as t:
                # context manager guarantees the pool is closed and joined
                # (the original leaked the pool)
                with Pool(self.params["n_processes"]) as p:
                    for disease, result in p.imap(process_disease_wrapper,
                                                  diseases):
                        results.append(result)
                        indices.append(disease.id)
                        t.update()
        else:
            with tqdm(total=len(diseases)) as t:
                for disease in diseases:
                    disease, result = self.process_disease(disease)
                    results.append(result)
                    indices.append(disease.id)
                    t.update()

        self.results = pd.DataFrame(results, index=indices)

    def save_results(self, summary=True):
        """
        Saves the results to a csv using a pandas DataFrame.
        """
        print("Saving Results...")
        self.results.to_csv(os.path.join(self.dir, 'predictions.csv'))

    def load_results(self):
        """
        Loads the results from a csv to a pandas DataFrame.
        """
        print("Loading Results...")
        self.results = pd.read_csv(os.path.join(self.dir, 'predictions.csv'))