Exemplo n.º 1
0
    def _process_input_files(self):
        """Parse the interactions and annotations files and reconcile them.

        The interactions file path is taken from ``self.args[0]`` and the
        annotations file path from ``self.args[1]``. Annotations files whose
        name ends in ``.gmt`` are parsed with ``parsers.parse_gmt_to_dict``;
        all others with ``parsers.parse_annotations_to_dict``.

        After parsing, the graph and the annotations are pruned against each
        other so that both describe the same set of genes.

        Sets ``self.interactions_graph``, ``self.annotations_dict`` and
        ``self.annotations_stats``.

        Raises:
            AssertionError: if, after pruning, the graph size differs from
                the ``num_genes`` annotation statistic, or if any annotation
                term is left with no genes.
        """
        # Open both inputs in a single ``with`` so that they are closed even
        # when parsing, pruning or a sanity check raises. The files stay open
        # for the whole method, exactly as long as they did before.
        with open(self.args[0], "rb") as interactions_file, \
                open(self.args[1], "rb") as annotations_file:
            # Create interaction graph
            logger.info("Parsing interactions from {0}.".format(interactions_file.name))
            self.interactions_graph = parsers.parse_interactions_file_to_graph(interactions_file)
            logger.info(
                "{0} genes (products) with {1} interactions "
                "parsed.".format(len(self.interactions_graph), self.interactions_graph.number_of_edges())
            )

            # Create dictionary of annotations to genes, but only for genes
            # in the interaction graph
            logger.info("Parsing annotations from {0}.".format(annotations_file.name))
            if annotations_file.name.endswith(".gmt"):
                self.annotations_dict = parsers.parse_gmt_to_dict(annotations_file)
            else:
                self.annotations_dict = parsers.parse_annotations_to_dict(annotations_file)

            self.annotations_stats = structures.get_annotations_stats(self.annotations_dict)
            logger.info(
                "{num_total_annotations} annotations processed, "
                "for {num_genes} genes (or gene products), by "
                "{num_annotation_terms} different terms.".format(**self.annotations_stats)
            )

            # Remove from the graph the set of nodes that have no annotation.
            logger.info("Pruning unannotated genes (products) from interaction graph.")
            self.interactions_graph.prune_unannotated_genes(self.annotations_dict)
            logger.info(
                "{0} genes (products) with {1} interactions "
                "remaining in graph.".format(len(self.interactions_graph), self.interactions_graph.number_of_edges())
            )

            # Remove from the annotations any genes which are not in the
            # graph, then refresh the statistics to reflect the pruned dict.
            logger.info("Removing genes with no interactions from the sets of annotated genes.")
            self.interactions_graph.prune_non_network_genes_from_annotations(self.annotations_dict)
            self.annotations_stats = structures.get_annotations_stats(self.annotations_dict)
            logger.info(
                "{num_total_annotations} annotations, "
                "for {num_genes} genes (or gene products), by "
                "{num_annotation_terms} different terms "
                "remain.".format(**self.annotations_stats)
            )

            # Sanity test: the number of genes (products) in the
            # interactions_graph should equal the union of all the sets in
            # annotations_dict
            assert len(self.interactions_graph) == self.annotations_stats["num_genes"], (
                "interactions_graph and annotations_dict have unequal numbers of genes!"
            )

            # ``items()`` rather than ``iteritems()``: the latter exists only
            # on Python 2 dicts, while ``items()`` works on both 2 and 3.
            for term, genes in self.annotations_dict.items():
                assert len(genes) > 0, "%s has no genes!" % term
Exemplo n.º 2
0
    def _process_input_files(self):
        """Process the base inputs, then apply expression values to the graph.

        Runs the parent class's ``_process_input_files`` first, then parses
        the expression file whose path is ``self.args[2]``. The parsed values
        are applied to ``self.interactions_graph`` and the annotations
        dictionary is pruned again to match the graph.

        Finally ``self.annotations_stats`` is recomputed and a summary of the
        remaining genes, interactions, annotations and terms is logged.
        """
        super(ContextualCli, self)._process_input_files()

        # The ``with`` block closes the expression file even if parsing or
        # one of the graph updates raises. It covers the same statements
        # during which the file was open before.
        with open(self.args[2], "rb") as expression_file:
            # Get the expression values.
            logger.info("Parsing expression values from %s." % expression_file.name)
            expression_values = parsers.parse_expression_file(expression_file)
            # Apply the expression values to the interaction graph, removing
            # any nodes lacking expression values from the graph.
            logger.info("Removing genes without expression values from interaction graph and annotation sets.")
            self.interactions_graph.apply_expression_values_to_interactions_graph(expression_values)
            # Re-synchronize the interaction graph and annotations dictionary.
            self.interactions_graph.prune_non_network_genes_from_annotations(self.annotations_dict)

        # Refresh the statistics and merge in the edge count so that a single
        # mapping can feed the named %-placeholders in the summary message.
        self.annotations_stats = structures.get_annotations_stats(self.annotations_dict)
        gene_stats = {"num_interactions": self.interactions_graph.number_of_edges()}
        gene_stats.update(self.annotations_stats)
        logger.info(
            "%(num_genes)d genes (products) with "
            "%(num_interactions)d interactions remaining in "
            "graph, with %(num_total_annotations)d annotations by "
            "%(num_annotation_terms)d terms." % gene_stats
        )