def test_gaf_2_1_simple_terms():
    """A GAF 2.1 line with an empty qualifier is upgraded to a default relation.

    One term per GO aspect is exercised; each is expected to receive the
    corresponding default RO relation from ``upgrade_empty_qualifier``.
    The original triplicated the parser setup verbatim; it is factored into
    a local helper here (behavior unchanged).
    """

    def _upgraded_qualifier(go_id):
        # Fresh parser per case so report/closure state cannot leak between
        # assertions, matching the original's three independent setups.
        line = ["SGD", "S000000819", "AFG3", "", go_id,
                "PMID:8681382|SGD_REF:S000055187", "IMP", "", "P",
                "Mitochondrial inner membrane m-AAA protease component",
                "YER017C|AAA family ATPase AFG3|YTA10", "gene",
                "taxon:559292", "20170428", "SGD"]
        ontology = OntologyFactory().create("tests/resources/goslim_generic.json")
        p = GafParser(config=assocparser.AssocParserConfig(ontology=ontology))
        p.make_internal_cell_component_closure()
        parsed = gafparser.to_association(line)
        assoc = p.upgrade_empty_qualifier(parsed.associations[0])
        return assoc.qualifiers[0]

    assert _upgraded_qualifier("GO:0006259") == association.Curie(namespace="RO", identity="0002264")
    assert _upgraded_qualifier("GO:0042393") == association.Curie(namespace="RO", identity="0002327")
    assert _upgraded_qualifier("GO:0005773") == association.Curie(namespace="RO", identity="0001025")
def load_associations_from_file(self, associations_type: DataType, associations_url: str,
                                associations_cache_path: str,
                                config: GenedescConfigParser) -> None:
    """load go associations from file

    Args:
        associations_type (DataType): the type of associations to set
        associations_url (str): url to the association file
        associations_cache_path (str): path to cache file for the associations
        config (GenedescConfigParser): configuration object where to read properties
    """
    assoc_config = AssocParserConfig(remove_double_prefixes=True, paint=True)

    def _parse_into_set(ontology):
        # Parse the (cached) GAF file into an AssociationSet over *ontology*.
        # This expression was previously triplicated across the three branches.
        return AssociationSetFactory().create_from_assocs(
            assocs=GafParser(config=assoc_config).parse(
                file=self._get_cached_file(cache_path=associations_cache_path,
                                           file_source_url=associations_url),
                skipheader=True),
            ontology=ontology)

    if associations_type == DataType.GO:
        logger.info("Loading GO associations from file")
        self.go_associations = _parse_into_set(self.go_ontology)
        self.go_associations = self.remove_blacklisted_annotations(
            association_set=self.go_associations, ontology=self.go_ontology,
            terms_blacklist=config.get_module_property(
                module=Module.GO, prop=ConfigModuleProperty.EXCLUDE_TERMS))
    elif associations_type == DataType.DO:
        logger.info("Loading DO associations from file")
        self.do_associations = _parse_into_set(self.do_ontology)
        self.do_associations = self.remove_blacklisted_annotations(
            association_set=self.do_associations, ontology=self.do_ontology,
            terms_blacklist=config.get_module_property(
                module=Module.DO_EXP_AND_BIO, prop=ConfigModuleProperty.EXCLUDE_TERMS))
    elif associations_type == DataType.EXPR:
        logger.info("Loading Expression associations from file")
        self.expression_associations = _parse_into_set(self.expression_ontology)
        self.expression_associations = self.remove_blacklisted_annotations(
            association_set=self.expression_associations, ontology=self.expression_ontology,
            terms_blacklist=config.get_module_property(
                module=Module.EXPRESSION, prop=ConfigModuleProperty.EXCLUDE_TERMS))
def produce_gaf(dataset, source_gaf, ontology_graph, gpipath=None, paint=False, group="unknown"):
    """Validate *source_gaf*, writing a validated GAF plus filtered-evidence
    and report (markdown + json) files next to the source.

    Returns ``[validated_gaf_path, filtered_gaf_path]``.

    Fix: the output files were opened without ``with``/``finally``, so an
    exception during parsing leaked the handles; they are now context-managed.
    """
    base_dir = os.path.split(source_gaf)[0]
    filtered_path = os.path.join(base_dir, "{}_noiea.gaf".format(dataset))
    validated_gaf_path = os.path.join(base_dir, "{}_valid.gaf".format(dataset))

    # Count lines first so the progress bar has a known length.
    with open(source_gaf) as sg:
        lines = sum(1 for line in sg)

    with open(filtered_path, "w") as filtered_associations, \
            open(validated_gaf_path, "w") as outfile:
        config = assocparser.AssocParserConfig(
            ontology=ontology_graph,
            filter_out_evidence=["IEA"],
            filtered_evidence_file=filtered_associations,
            gpi_authority_path=gpipath,
            paint=paint
        )
        gafwriter = GafWriter(file=outfile)
        click.echo("Validating source GAF: {}".format(source_gaf))
        parser = GafParser(config=config, group=group, dataset=dataset)
        with open(source_gaf) as gaf:
            with click.progressbar(iterable=parser.association_generator(file=gaf),
                                   length=lines) as associations:
                for assoc in associations:
                    gafwriter.write_assoc(assoc)

    with open(os.path.join(base_dir, "{}.report.md".format(dataset)), "w") as report_md:
        report_md.write(parser.report.to_markdown())
    with open(os.path.join(base_dir, "{}.report.json".format(dataset)), "w") as report_json:
        report_json.write(json.dumps(parser.report.to_report_json(), indent=4))

    return [validated_gaf_path, filtered_path]
def load_associations(self, taxon) -> None:
    """Load gene associations for *taxon* ('human' or 'mouse')."""
    taxon_map = {
        'human': 'NCBITaxon:9606',
        'mouse': 'NCBITaxon:10090',
    }
    self.ontology = OntologyFactory().create(self.ont)
    parser = GafParser()
    url = ''
    if self.ont == 'go':
        # CX: GO:0008150 is biological_process, GO:0003674 is molecular_function.
        # CX: These are 2 out of 3 top-level terms in GO ontology.
        # CX: The excluded term is cellular_component (where gene carries out a molecular function)
        go_roots = set(self.ontology.descendants('GO:0008150')
                       + self.ontology.descendants('GO:0003674'))
        sub_ont = self.ontology.subontology(go_roots)
        if taxon == 'mouse':
            url = "http://current.geneontology.org/annotations/mgi.gaf.gz"
        if taxon == 'human':
            url = "http://current.geneontology.org/annotations/goa_human.gaf.gz"
        assocs = parser.parse(url)
        self.assocs = assocs
        # Drop header entries, then keep only annotations under the two roots.
        assocs = [a for a in assocs if 'header' not in a.keys()]
        assocs = [a for a in assocs if a['object']['id'] in go_roots]
        self.associations = self.afactory.create_from_assocs(assocs, ontology=sub_ont)
    else:
        self.associations = self.afactory.create(
            ontology=self.ontology,
            subject_category='gene',
            object_category='phenotype',
            taxon=taxon_map[taxon])
def test_errors_gaf():
    """Parse errors.gaf: check message count, surviving assocs, and extensions."""
    parser = GafParser()
    parser.config.ecomap = EcoMap()
    assocs = parser.parse(open("tests/resources/errors.gaf", "r"))
    messages = parser.report.messages
    print("MESSAGES: {}".format(len(messages)))
    for message in messages:
        print("MESSAGE: {}".format(message))
    assert len(messages) == 15
    assert len(assocs) == 7
    from ontobio.io import GafWriter
    writer = GafWriter()
    writer.write(assocs)
    for assoc in assocs:
        if 'object_extensions' in assoc:
            # our test file has no ORs, so in DNF this is always the first
            conjunction = assoc['object_extensions']['union_of'][0]['intersection_of']
            for unit in conjunction:
                print('X: {}'.format(unit))
                # ensure that invalid expressions have been eliminated
                assert unit['property'] == 'foo'
                assert unit['filler'] == 'X:1'
            assert len(conjunction) == 1
def test_qualifiers_gaf():
    # ont = OntologyFactory().create(ONT)
    parser = GafParser()
    # p.config.ontology = ont
    assocs = parser.parse(open(QGAF, "r"), skipheader=True)
    negated = [a for a in assocs if a['negated'] == True]
    assert len(negated) == 3
    for assoc in assocs:
        print('REL: {}'.format(assoc['relation']))
    contributes = [a for a in assocs if a['relation']['id'] == 'contributes_to']
    assert len(contributes) == 1
    # For the space in `colocalizes with`
    rule_msgs = parser.report.to_report_json()["messages"]["gorule-0000001"]
    assert len([e for e in rule_msgs if e["obj"] == "colocalizes with"]) == 1
    assert len([e for e in rule_msgs if e["obj"] == "involved_in"]) == 1
def produce_ttl(dataset, target_dir, gaf_path, ontology_graph):
    """Translate the GAF at *gaf_path* into a turtle (.ttl) RDF file.

    Returns the path of the written ttl file (``<dataset>_cam.ttl`` next to
    the source GAF).

    Fixes: a second ``AssocParserConfig`` was built and never used (removed);
    locals previously shadowed the ``gafparser``/``association`` module names
    used elsewhere in this codebase (renamed).
    """
    parser = GafParser()
    parser.config = assocparser.AssocParserConfig(ontology=ontology_graph)
    # Count lines first so the progress bar has a known length.
    with open(gaf_path) as sg:
        lines = sum(1 for line in sg)

    ttl_path = os.path.join(os.path.split(gaf_path)[0], "{}_cam.ttl".format(dataset))
    click.echo("Producing ttl: {}".format(ttl_path))
    rdf_writer = assoc_rdfgen.TurtleRdfWriter()
    transformer = assoc_rdfgen.CamRdfTransform(writer=rdf_writer)

    with open(gaf_path) as gf:
        with click.progressbar(iterable=parser.association_generator(file=gf),
                               length=lines) as associations:
            for assoc in associations:
                # Skip header entries; translate each real association.
                if "header" not in assoc or not assoc["header"]:
                    transformer.provenance()
                    transformer.translate(assoc)

    with open(ttl_path, "wb") as ttl:
        click.echo("Writing ttl to disk")
        rdf_writer.serialize(destination=ttl)
    return ttl_path
def test_errors_gaf():
    """Parse errors.gaf with the typed association model and verify the report."""
    parser = GafParser(config=assocparser.AssocParserConfig(ecomap=EcoMap()))
    assocs = parser.parse(open("tests/resources/errors.gaf", "r"), skipheader=True)
    messages = parser.report.messages
    print(json.dumps(parser.report.to_report_json(), indent=4))
    # print("MESSAGES: {}".format(len(messages)))
    n_invalid_idspace = 0
    for message in messages:
        print("MESSAGE: {}".format(message))
        if message['type'] == assocparser.Report.INVALID_IDSPACE:
            n_invalid_idspace += 1
    assert len(messages) == 13
    assert n_invalid_idspace == 1
    assert len(assocs) == 2

    writer = GafWriter()
    writer.write(assocs)
    for assoc in assocs:
        if assoc.object_extensions != []:
            # our test file has no ORs, so in DNF this is always the first
            units = assoc.object_extensions[0].elements
            print(units)
            for unit in units:
                print('X: {}'.format(unit))
                # ensure that invalid expressions have been eliminated
                assert unit.relation == association.Curie("BFO", "0000050")
                assert unit.term == association.Curie.from_str('X:1')
            assert len(units) == 1
def test_errors_gaf():
    """Parse errors.gaf (dict association model) and verify the report."""
    parser = GafParser(config=assocparser.AssocParserConfig(ecomap=EcoMap()))
    assocs = parser.parse(open("tests/resources/errors.gaf", "r"), skipheader=True)
    messages = parser.report.messages
    print(json.dumps(parser.report.to_report_json(), indent=4))
    # print("MESSAGES: {}".format(len(messages)))
    n_invalid_idspace = sum(
        1 for m in messages if m['type'] == assocparser.Report.INVALID_IDSPACE)
    for message in messages:
        print("MESSAGE: {}".format(message))
    assert len(messages) == 16
    assert n_invalid_idspace == 1
    assert len(assocs) == 5

    writer = GafWriter()
    writer.write(assocs)
    for assoc in assocs:
        if assoc['object_extensions'] != {}:
            # our test file has no ORs, so in DNF this is always the first
            conjunction = assoc['object_extensions']['union_of'][0]['intersection_of']
            for unit in conjunction:
                print('X: {}'.format(unit))
                # ensure that invalid expressions have been eliminated
                assert unit['property'] == 'foo'
                assert unit['filler'] == 'X:1'
            assert len(conjunction) == 1
def test_no_flag_valid_id():
    """A well-formed GO class id produces no report messages."""
    ontology = OntologyFactory().create(ONT)
    parser = GafParser()
    parser.config.ontology = ontology
    split_line = assocparser.SplitLine("fake", [""] * 17, taxon="foo")
    parser._validate_ontology_class_id("GO:0000785", split_line)
    assert len(parser.report.messages) == 0
def test_bad_date():
    """A GAF line whose date column is not a date ('TODAY') is skipped."""
    parser = GafParser()
    line = "PomBase\tSPAC25B8.17\typf1\t\tGO:0000007\tGO_REF:0000024\tISO\tSGD:S000001583\tC\tintramembrane aspartyl protease of the perinuclear ER membrane Ypf1 (predicted)\tppp81\tprotein\ttaxon:4896\tTODAY\tPomBase\tfoo(X:1)"
    result = parser.parse_line(line)
    assert result.skipped == True
    assert result.associations == []
def test_semsearch():
    """Build an AssociationSet for a few genes and exercise pairwise similarity."""
    factory = AssociationSetFactory()
    ont = OntologyFactory().create(ONT)
    parser = GafParser()
    assocs = parser.parse(POMBASE, skipheader=True)
    assocs = [a for a in assocs if a['subject']['label'] in GENES]
    aset = factory.create_from_assocs(assocs, ontology=ont)
    ont = aset.subontology()
    aset.ontology = ont
    logging.info('Genes={} Terms={}'.format(len(aset.subjects), len(ont.nodes())))
    print('STATS={}'.format(aset.as_dataframe().describe()))
    engine = SemSearchEngine(assocmodel=aset)
    logging.info('Calculating all MICAs')
    engine.calculate_all_micas()
    logging.info('Doing pairwise')
    for gene_a in aset.subjects:
        for gene_b in aset.subjects:
            sim = engine.pw_score_cosine(gene_a, gene_b)
            # Self-similarity must be (near) 1.
            if gene_a == gene_b:
                assert (sim > 0.9999)
            tups = engine.pw_score_resnik_bestmatches(gene_a, gene_b)
            print('{} x {} = {} // {}'.format(gene_a, gene_b, sim, tups))
def retrieve_associations(self, ont, group):
    """Retrieve and load associations for *group* over ontology *ont*.

    Fix: the GO branch parsed a hard-coded local 'goa_human.gaf.gz'
    (debug leftover — the real ``p.parse(url)`` call was commented out),
    which ignored *group* and broke the mouse path. The computed *url*
    is parsed again.
    """
    taxon_map = {
        'human': 'NCBITaxon:9606',
        'mouse': 'NCBITaxon:10090',
    }
    ofactory = OntologyFactory()
    self.ontology = ofactory.create(ont)
    p = GafParser()
    url = ''
    if ont == 'go':
        go_roots = set(self.ontology.descendants('GO:0008150')
                       + self.ontology.descendants('GO:0003674'))
        sub_ont = self.ontology.subontology(go_roots)
        if group == 'mouse':
            url = "http://current.geneontology.org/annotations/mgi.gaf.gz"
        if group == 'human':
            url = "http://current.geneontology.org/annotations/goa_human.gaf.gz"
        assocs = p.parse(url)
        self.assocs = assocs
        # Drop header entries, then keep only annotations under the two roots.
        assocs = [x for x in assocs if 'header' not in x.keys()]
        assocs = [x for x in assocs if x['object']['id'] in go_roots]
        self.associations = self.afactory.create_from_assocs(assocs, ontology=sub_ont)
    else:
        self.associations = self.afactory.create(
            ontology=self.ontology,
            subject_category='gene',
            object_category='phenotype',
            taxon=taxon_map[group])
def create_from_file(self, file=None, fmt='gaf', skim=True, **args):
    """
    Creates from a file.

    Arguments
    ---------
    file : str or file
        input file or filename
    format : str
        name of format e.g. gaf
    """
    # Dispatch on format name; unknown formats are logged and leave p as None
    # (the subsequent call will then fail, matching previous behavior).
    parser_classes = {'gaf': GafParser, 'gpad': GpadParser, 'hpoa': HpoaParser}
    p = None
    if fmt in parser_classes:
        p = parser_classes[fmt]()
    else:
        logging.error("Format not recognized: {}".format(fmt))
    logging.info("Parsing {} with {}/{}".format(file, fmt, p))
    if skim:
        return self.create_from_tuples(p.skim(file), **args)
    assocs = p.parse(file, skipheader=True)
    return self.create_from_assocs(assocs, **args)
def produce_gpi(dataset, target_dir, gaf_path, ontology_graph):
    """Build a GPI file from the GAF at *gaf_path*; returns the gpi path."""
    parser = GafParser()
    parser.config = assocparser.AssocParserConfig(ontology=ontology_graph)
    # Count lines first so the progress bar has a known length.
    with open(gaf_path) as counted:
        lines = sum(1 for line in counted)
    gpi_path = os.path.join(os.path.split(gaf_path)[0], "{}.gpi".format(dataset))
    with open(gaf_path) as gf, open(gpi_path, "w") as gpi:
        click.echo("Using {} as the gaf to build gpi with".format(gaf_path))
        bridge = gafgpibridge.GafGpiBridge()
        gpiwriter = entitywriter.GpiWriter(file=gpi)
        seen = set()
        with click.progressbar(iterable=parser.association_generator(file=gf),
                               length=lines) as associations:
            for assoc in associations:
                entity = bridge.convert_association(assoc)
                # Write each distinct entity exactly once.
                if entity not in seen and entity is not None:
                    seen.add(entity)
                    gpiwriter.write_entity(entity)
    return gpi_path
def test_default_gaf_version():
    """A GAF file without a version header is treated as GAF 2.1."""
    parser = GafParser()
    parser.parse(open("tests/resources/test-qualifiers-no-version.gaf"), skipheader=True)
    assert parser.version == "2.1"
def test_subject_extensions():
    """Column 17 isoform data lands in subject_extensions (dict model)."""
    parser = GafParser()
    line = "PomBase\tSPAC25B8.17\typf1\t\tGO:0000007\tGO_REF:0000024\tISO\tSGD:S000001583\tC\tintramembrane aspartyl protease of the perinuclear ER membrane Ypf1 (predicted)\tppp81\tprotein\ttaxon:4896\t20181024\tPomBase\tfoo(X:1)\tUniProtKB:P12345"
    result = parser.parse_line(line)
    print(json.dumps(result.associations[0], indent=4))
    assert "subject_extensions" in result.associations[0]
    extensions = result.associations[0]['subject_extensions']
    isoforms = [ext["filler"] for ext in extensions if ext["property"] == "isoform"]
    assert isoforms[0] == "UniProtKB:P12345"
def test_subject_extensions():
    """Column 17 isoform data lands in subject_extensions (typed model)."""
    parser = GafParser()
    line = "PomBase\tSPAC25B8.17\typf1\t\tGO:0000007\tGO_REF:0000024\tISO\tSGD:S000001583\tC\tintramembrane aspartyl protease of the perinuclear ER membrane Ypf1 (predicted)\tppp81\tprotein\ttaxon:4896\t20181024\tPomBase\tpart_of(X:1)\tUniProtKB:P12345"
    result = parser.parse_line(line)
    extensions = result.associations[0].subject_extensions
    assert len(extensions) == 1
    assert extensions[0].term == association.Curie.from_str("UniProtKB:P12345")
def test_validate_go_idspaces():
    """With class_idspaces=['FOOZ'], every GO annotation is rejected."""
    ont = OntologyFactory().create(ONT)
    parser = GafParser()
    parser.config.class_idspaces = ['FOOZ']
    assocs = parser.parse(open(POMBASE, "r"), skipheader=True)
    for message in parser.report.messages:
        print("MESSAGE: {}".format(message))
    assert len(assocs) == 0
    assert len(parser.report.messages) > 1
    summary = parser.report.to_report_json()
    assert summary['associations'] == 0
    assert summary['lines'] > 300
    print(parser.report.to_markdown())

    # ensure config is not preserved
    fresh_parser = GafParser()
    assert fresh_parser.config.class_idspaces == None
def test_obsolete_replair_of_withfrom():
    """Obsolete terms in with/from are repaired; ones with no replacement are dropped."""
    parser = GafParser(config=assocparser.AssocParserConfig(
        ontology=OntologyFactory().create(OBSOLETE_ONT)))
    assocs = parser.parse(open(ZFIN_GAF, "r"), skipheader=True)
    expected = [ConjunctiveSet(elements=[Curie(namespace='GO', identity='0005912')])]
    assert assocs[0].evidence.with_support_from == expected

    # Reset parser report
    parser = GafParser(config=assocparser.AssocParserConfig(
        ontology=OntologyFactory().create(OBSOLETE_ONT)))
    parser.version = "2.2"
    obsolete_no_replacement_line = "FB\tFBgn0003334\tScm\tlocated_in\tGO:0005634\tFB:FBrf0179383|PMID:15280237\tIC\tGO:0016458\tC\tSex comb on midleg\tCG9495|SCM|Sex Comb on Midleg|Sex Comb on the Midleg|Sex combs on midleg|Sex combs on midlegs|Su(z)302|l(3)85Ef|scm|sex comb on midleg\tprotein\ttaxon:7227\t20050203\tUniProt\t\t"
    result = parser.parse_line(obsolete_no_replacement_line)
    assert result.associations == []
    report = parser.report.to_report_json()
    assert report["messages"]["gorule-0000020"][0]["obj"] == "GO:0016458"
def test_gaf_2_1_upconvert_in_parse():
    """A GAF 2.1 line with a blank qualifier is upgraded during parse()."""
    gaf = io.StringIO("!gaf-version: 2.1\nSGD\tS000000819\tAFG3\t\tGO:0005840\tPMID:8681382|SGD_REF:S000055187\tIMP\t\tP\tMitochondrial inner membrane m-AAA protease component\tYER017C|AAA family ATPase AFG3|YTA10\tgene\ttaxon:559292\t20170428\tSGD")
    ontology = OntologyFactory().create("tests/resources/goslim_generic.json")
    parser = GafParser(config=assocparser.AssocParserConfig(ontology=ontology))
    # We're 2.1, qualifier blank, cell component term from above, ontology defined: should upgrade
    assocs = parser.parse(gaf, skipheader=True)
    assert assocs[0].relation == association.Curie(namespace="BFO", identity="0000050")
def test_one_line():
    """Smoke-test parse_line with an ontology-backed parser."""
    ontology = OntologyFactory().create("tests/resources/goslim_generic.json")
    parser = GafParser(config=assocparser.AssocParserConfig(ontology=ontology))
    parsed = parser.parse_line(
        "PomBase SPBC16D10.09 pcn1 GO:0009536 PMID:8663159 IDA C PCNA pcn protein taxon:4896 20150326 PomBase"
    )
def test_bad_withfrom():
    """A with/from CURIE with no identity after the namespace is rejected."""
    parser = GafParser()
    # With/from has no identity portion after the namespace
    line = "PomBase\tSPAC25B8.17\typf1\t\tGO:0000007\tGO_REF:0000024\tISO\tSGD:\tC\tintramembrane aspartyl protease of the perinuclear ER membrane Ypf1 (predicted)\tppp81\tprotein\ttaxon:4896\t20181024\tPomBase"
    result = parser.parse_line(line)
    assert result.associations == []
    report = parser.report.to_report_json()
    assert report["messages"]["gorule-0000001"][0]["obj"] == "SGD:"
def test_errors_gaf():
    """Parsing errors.gaf yields exactly 8 report messages."""
    parser = GafParser()
    parser.config.ecomap = EcoMap()
    parser.parse(open("tests/resources/errors.gaf", "r"))
    messages = parser.report.messages
    print("MESSAGES: {}".format(len(messages)))
    for message in messages:
        print("MESSAGE: {}".format(message))
    assert len(messages) == 8
def test_alt_id_repair():
    """An annotation to an alt_id is repaired to a valid primary id."""
    parser = GafParser()
    ontology = OntologyFactory().create(ALT_ID_ONT)
    parser.config.ecomap = EcoMap()
    parser.config.ontology = ontology
    gaf = io.StringIO("SGD\tS000000819\tAFG3\t\tGO:0043623\tPMID:8681382|SGD_REF:S000055187\tIMP\t\tP\tMitochondrial inner membrane m-AAA protease component\tYER017C|AAA family ATPase AFG3|YTA10\tgene\ttaxon:559292\t20170428\tSGD")
    assocs = parser.parse(gaf, skipheader=True)
    assert len(assocs) > 0
    assert assocs[0]["object"]["id"] == "GO:0043623"
def create_from_remote_file(self, group, snapshot=True, **args):
    """
    Creates from remote GAF
    """
    import requests
    url = "http://snapshot.geneontology.org/annotations/{}.gaf.gz".format(group)
    headers = {'User-Agent': get_user_agent(modules=[requests], caller_name=__name__)}
    response = requests.get(url, stream=True, headers=headers)
    parser = GafParser()
    # Skim the streamed response body directly.
    tuples = parser.skim(response.raw)
    return self.create_from_tuples(tuples, **args)
def create_from_remote_file(self, group, snapshot=True, **args):
    """
    Creates from remote GAF
    """
    import requests
    url = "http://snapshot.geneontology.org/annotations/{}.gaf.gz".format(group)
    response = requests.get(url, stream=True)
    parser = GafParser()
    # Skim the streamed response body directly.
    tuples = parser.skim(response.raw)
    return self.create_from_tuples(tuples, **args)
def test_skim_gaf():
    """skim() returns (subject, subject_name, object) tuples for every row."""
    parser = GafParser()
    parser.config.ecomap = EcoMap()
    results = parser.skim(open(POMBASE, "r"))
    assert len(results) == 370
    for result in results:
        print(str(result))
        (subj, subj_name, obj) = result
        assert obj.startswith('GO:')
        assert subj.startswith('PomBase:')
def test_object_extensions():
    """Column 16 'part_of(X:1)' parses into one ConjunctiveSet extension."""
    parser = GafParser()
    line = "PomBase\tSPAC25B8.17\typf1\t\tGO:0000007\tGO_REF:0000024\tISO\tSGD:S000001583\tC\tintramembrane aspartyl protease of the perinuclear ER membrane Ypf1 (predicted)\tppp81\tprotein\ttaxon:4896\t20181024\tPomBase\tpart_of(X:1)\tUniProtKB:P12345"
    result = parser.parse_line(line)
    print(parser.report.to_markdown())
    assert len(result.associations[0].object_extensions) > 0
    expected = [
        association.ConjunctiveSet([
            association.ExtensionUnit(association.Curie("BFO", "0000050"),
                                      association.Curie("X", "1"))
        ])
    ]
    assert result.associations[0].object_extensions == expected
def load_associations(self, group):
    """Download and load the GAF association set for *group*."""
    parser = GafParser()
    afactory = AssociationSetFactory()
    if group == 'human':
        url = "http://geneontology.org/gene-associations/goa_human.gaf.gz"
    else:
        url = "http://geneontology.org/gene-associations/gene_association.{}.gz".format(group)
    assocs = parser.parse(url)
    # Drop header entries before building the association set.
    assocs = [a for a in assocs if 'header' not in a.keys()]
    self.associations = afactory.create_from_assocs(assocs, ontology=self.ontology)