class TestGOModule(unittest.TestCase):
    """Tests for ``DataManager`` driven by the GO test ontology and associations.

    ``setUp`` builds a fresh ``DataManager`` for every test, loading the
    ontology and association fixtures found in the ``data`` directory next to
    this module and caching them under the sibling ``cache`` directory.
    """

    def setUp(self):
        """Load the test configuration, the GO ontology and the GO associations."""
        logging.basicConfig(filename=None, level="ERROR",
                            format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
        logger.info("Starting DataManager tests")
        self.this_dir = os.path.split(__file__)[0]
        self.conf_parser = GenedescConfigParser(
            os.path.join(self.this_dir, os.path.pardir, "tests", "config_test.yml"))
        self.df = DataManager(do_relations=None, go_relations=["subClassOf", "BFO:0000050"])
        logger.info("Loading go ontology from file")
        self.df.load_ontology_from_file(
            ontology_type=DataType.GO,
            ontology_url="file://" + os.path.join(self.this_dir, "data", "go_gd_test.obo"),
            ontology_cache_path=os.path.join(self.this_dir, "cache", "go_gd_test.obo"),
            config=self.conf_parser)
        logger.info("Loading go associations from file")
        self.df.load_associations_from_file(
            associations_type=DataType.GO,
            associations_url="file://" + os.path.join(
                self.this_dir, "data", "gene_association_1.7.wb.partial"),
            associations_cache_path=os.path.join(
                self.this_dir, "cache", "gene_association_1.7.wb.partial"),
            config=self.conf_parser)

    def test_ontology_exists(self):
        """The ontology is loaded and GO:0009987 is a parent of GO:0000075."""
        self.assertIsNotNone(self.df.go_ontology)
        # Materialize the parents so a failure message lists what was found.
        self.assertIn("GO:0009987", list(self.df.go_ontology.parents("GO:0000075")))

    def test_annotations_exist(self):
        """Associations are loaded and the test gene has at least one annotation."""
        self.assertIsNotNone(self.df.go_associations)
        annotations = self.df.get_annotations_for_gene(
            gene_id="WB:WBGene00000001", annot_type=DataType.GO, include_obsolete=False,
            include_negative_results=False,
            priority_list=self.conf_parser.get_annotations_priority(module=Module.GO))
        self.assertGreater(len(annotations), 0)

    def test_rename_terms(self):
        """Searching the ontology for any RENAME_TERMS key yields no result."""
        rename_terms = self.conf_parser.get_module_property(
            module=Module.GO, prop=ConfigModuleProperty.RENAME_TERMS)
        for term in list(rename_terms.keys()):
            # subTest reports every offending term instead of stopping at the first.
            with self.subTest(term=term):
                self.assertEqual(len(self.df.go_ontology.search(term)), 0)

    def test_exclude_terms(self):
        """GO:0008286 never appears among the annotations of the test gene."""
        test_annot = self.df.get_annotations_for_gene("WB:WBGene00000001", annot_type=DataType.GO)
        self.assertNotIn("GO:0008286", [annot["object"]["id"] for annot in test_annot])

    def test_download_gz_file(self):
        """Caching a ``.gz`` source returns the cache path without the ``.gz`` suffix."""
        test_file = self.df._get_cached_file(
            cache_path=os.path.join(
                self.this_dir, "cache", "c_elegans.PRJNA13758.WS273.geneIDs.txt.gz"),
            file_source_url="file://" + os.path.join(
                self.this_dir, "data", "c_elegans.PRJNA13758.WS273.geneIDs.txt.gz"))
        self.assertEqual(
            test_file,
            os.path.join(self.this_dir, "cache", "c_elegans.PRJNA13758.WS273.geneIDs.txt"))

    def test_gene_data_functions(self):
        """``get_gene_data`` honours the dead-gene and pseudo-gene filters."""
        self.df.set_gene_data(gene_data=[
            Gene("1", "gene1", True, False),
            Gene("2", "gene2", False, True),
            Gene("3", "gene3", False, False),
            Gene("4", "gene4", True, True),
        ])
        # (include_dead_genes, include_pseudo_genes, expected number of genes)
        expectations = (
            (False, False, 1),
            (True, False, 2),
            (False, True, 2),
            (True, True, 4),
        )
        for include_dead, include_pseudo, expected_count in expectations:
            with self.subTest(include_dead_genes=include_dead, include_pseudo_genes=include_pseudo):
                genes = list(self.df.get_gene_data(include_dead_genes=include_dead,
                                                   include_pseudo_genes=include_pseudo))
                self.assertEqual(len(genes), expected_count)

    def test_get_human_gene_props(self):
        """``get_human_gene_props`` returns a non-empty collection."""
        human_gene_props = self.df.get_human_gene_props()
        self.assertGreater(len(human_gene_props), 0)

    def test_get_ensembl_hgnc_ids_map(self):
        """``get_ensembl_hgnc_ids_map`` returns a non-empty collection."""
        ensembl_hgnc_ids_map = self.df.get_ensembl_hgnc_ids_map()
        self.assertGreater(len(ensembl_hgnc_ids_map), 0)

    def test_set_ontology(self):
        """An ontology passed to ``set_ontology`` is exposed with the same nodes."""
        ontology = OntologyFactory().create()
        for i in range(4):
            ontology.add_node(i, 'node' + str(i))
        # Nodes 1-3 are all direct children of node 0.
        ontology.add_parent(1, 0)
        ontology.add_parent(2, 0)
        ontology.add_parent(3, 0)
        self.df.set_ontology(ontology_type=DataType.GO, ontology=ontology, config=self.conf_parser)
        self.assertEqual(list(self.df.go_ontology.nodes()), list(ontology.nodes()))

    def test_set_associations(self):
        """Associations passed to ``set_associations`` end up in ``go_associations``."""
        associations = [
            DataManager.create_annotation_record("", "1", "a", "protein_coding", "001", "GO:0019901", "",
                                                 "F", "EXP", None, "WB", ""),
            DataManager.create_annotation_record("", "2", "b", "protein_coding", "001", "GO:0005515", "",
                                                 "F", "EXP", None, "WB", ""),
        ]
        assocs = AssociationSetFactory().create_from_assocs(assocs=associations,
                                                            ontology=self.df.go_ontology)
        self.df.set_associations(associations_type=DataType.GO, associations=assocs,
                                 config=self.conf_parser)
        self.assertTrue(self.df.go_associations)

    def test_remap_associations(self):
        """An annotation to GO:0018996 is stored under GO:0042303 once set.

        NOTE(review): the remapping is presumably driven by the test config;
        confirm against ``config_test.yml``.
        """
        associations = [
            DataManager.create_annotation_record("", "1", "a", "protein_coding", "001", "GO:0018996", "",
                                                 "F", "EXP", None, "WB", ""),
        ]
        assocs = AssociationSetFactory().create_from_assocs(assocs=associations,
                                                            ontology=self.df.go_ontology)
        self.df.set_associations(associations_type=DataType.GO, associations=assocs,
                                 config=self.conf_parser)
        self.assertEqual(
            self.df.go_associations.associations_by_subj["1"][0]["object"]["id"], "GO:0042303")
def _next_wb_release(release: str) -> str:
    """Return the release identifier that follows *release*.

    The whole trailing run of digits is incremented, so ``"WS279"`` becomes
    ``"WS280"``; bumping only the last character would produce ``"WS2710"``.
    The width of the numeric part is preserved when it has leading zeros.

    Raises:
        ValueError: if *release* does not end with a digit.
    """
    import re  # local import: the file's top-level import block is not touched

    match = re.search(r"[0-9]+\Z", release)
    if match is None:
        raise ValueError("release identifier does not end with a number: " + repr(release))
    digits = match.group()
    return release[:match.start()] + str(int(digits) + 1).zfill(len(digits))


def main():
    """Run the WormBase gene description pipeline from the command line.

    Parses the command line options, configures logging, and then, for every
    organism returned by the configuration, loads its data, builds a
    ``GeneDescription`` for each gene and writes the results in each of the
    requested output formats (``json``, ``txt``, ``tsv``, ``ace``) to the
    configured output directory. Output files are named
    ``<YYYYMMDD>_<organism>.<format>``.
    """
    parser = argparse.ArgumentParser(description="Generate gene descriptions for wormbase")
    parser.add_argument("-c", "--config-file", metavar="config_file", dest="config_file", type=str,
                        default="config.yml", help="configuration file. Default ./config.yml")
    parser.add_argument("-C", "--use-cache", dest="use_cache", action="store_true", default=False,
                        help="Use cached source files from cache_location specified in config file. "
                             "Download them from raw_file_source (configured in config file) if not "
                             "yet cached")
    # With no log file, logging.basicConfig(filename=None) logs to the console.
    parser.add_argument("-l", "--log-file", metavar="log_file", dest="log_file", type=str, default=None,
                        help="path to the log file to generate. Default: log to the console (stderr)")
    parser.add_argument("-L", "--log-level", dest="log_level",
                        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
                        help="set the logging level")
    parser.add_argument("-t", "--textpressoapi-token", metavar="textpresso_token",
                        dest="textpresso_token", type=str, help="Textpresso api token")
    parser.add_argument("-o", "--output-formats", metavar="output_formats", dest="output_formats",
                        type=str, nargs="+", default=["ace", "txt", "json", "tsv"],
                        help="file formats to generate. Accepted values "
                             "are: ace, txt, json, tsv")
    args = parser.parse_args()

    conf_parser = GenedescConfigParser(args.config_file)
    logging.basicConfig(filename=args.log_file, level=args.log_level,
                        format='%(asctime)s - %(name)s - %(levelname)s:%(message)s', force=True)
    logger = logging.getLogger("WB Gene Description Pipeline")

    organisms_list = conf_parser.get_wb_organisms_to_process()
    human_genes_props = DataManager.get_human_gene_props()
    api_manager = APIManager(textpresso_api_token=args.textpresso_token)
    # Configuration lookups that do not depend on the organism being processed.
    species = conf_parser.get_wb_organisms_info()
    wb_release = conf_parser.get_wb_release()

    for organism in organisms_list:
        logger.info("Processing organism %s", organism)
        dm, sister_df, df_agr = load_data(organism=organism, conf_parser=conf_parser)
        desc_writer = DescriptionsWriter()
        desc_writer.overall_properties.species = organism
        # The overall properties carry the release following the configured one.
        desc_writer.overall_properties.release_version = _next_wb_release(wb_release)
        desc_writer.overall_properties.date = datetime.date.today().strftime("%B %d, %Y")

        for gene in dm.get_gene_data():
            logger.debug("Generating description for gene %s", gene.name)
            gene_desc = GeneDescription(gene_id=gene.id, config=conf_parser, gene_name=gene.name,
                                        add_gene_name=False)
            selected_orthologs, orth_sent = get_best_orthologs_and_sentence(
                dm=dm, orth_fullnames=dm.orth_fullnames, human_genes_props=human_genes_props,
                gene_desc=gene_desc, api_manager=api_manager, config=conf_parser)
            set_gene_ontology_module(dm=dm, conf_parser=conf_parser, gene_desc=gene_desc, gene=gene)
            set_tissue_expression_sentence(dm=dm, gene=gene, conf_parser=conf_parser,
                                           gene_desc=gene_desc)
            # Expression clusters are only used when nothing has been written so far.
            if not gene_desc.description:
                set_expression_cluster_sentence(dm=dm, conf_parser=conf_parser, gene_desc=gene_desc,
                                                gene=gene, api_manager=api_manager)
            set_disease_module(df=dm, conf_parser=conf_parser, gene=gene, gene_desc=gene_desc)
            # The information-poor sentence is only added when there is no GO description.
            if not gene_desc.go_description:
                set_information_poor_sentence(
                    orth_fullnames=dm.orth_fullnames, selected_orthologs=selected_orthologs,
                    conf_parser=conf_parser, human_df_agr=df_agr, gene_desc=gene_desc, dm=dm,
                    gene=gene)
            gene_desc.set_or_extend_module_description_and_final_stats(
                module=Module.ORTHOLOGY, description=orth_sent)
            # The sister-species sentence requires a configured sister species
            # and a non-empty first element from the ortholog lookup.
            if ("main_sister_species" in species[organism]
                    and species[organism]["main_sister_species"]
                    and dm.get_best_orthologs_for_gene(
                        gene.id, orth_species_full_name=[dm.sister_sp_fullname],
                        sister_species_data_fetcher=sister_df,
                        ecode_priority_list=["EXP", "IDA", "IPI", "IMP", "IGI", "IEP", "HTP", "HDA",
                                             "HMP", "HGI", "HEP"])[0]):
                set_sister_species_sentence(
                    dm=dm, sister_sp_fullname=dm.sister_sp_fullname, sister_df=sister_df,
                    species=species, organism=organism, gene_desc=gene_desc,
                    conf_parser=conf_parser, gene=gene)
            desc_writer.add_gene_desc(gene_desc)
        logger.info("All genes processed for %s", organism)

        date_prefix = datetime.date.today().strftime("%Y%m%d")
        # Common path stem of every output file; only the extension differs.
        out_base = os.path.join(conf_parser.get_out_dir(), date_prefix + "_" + organism)
        if "json" in args.output_formats:
            logger.info("Writing descriptions to json")
            desc_writer.write_json(out_base + ".json", include_single_gene_stats=True,
                                   data_manager=dm)
        if "txt" in args.output_formats:
            logger.info("Writing descriptions to txt")
            desc_writer.write_plain_text(out_base + ".txt")
        if "tsv" in args.output_formats:
            logger.info("Writing descriptions to tsv")
            desc_writer.write_tsv(out_base + ".tsv")
        if "ace" in args.output_formats:
            logger.info("Writing descriptions to ace")
            curators = ["WBPerson324", "WBPerson37462"]
            # The ace file is written with the configured (not the next) release.
            desc_writer.write_ace(out_base + ".ace", curators, wb_release)