def test_semsearch():
    """Exercise SemSearchEngine pairwise scoring over a small POMBASE gene set.

    Builds an association set restricted to GENES, trims the ontology to the
    terms in use, precomputes MICAs, then checks that every gene's cosine
    self-similarity is ~1.0 while printing Resnik best-match scores.
    """
    afa = AssociationSetFactory()
    ont = OntologyFactory().create(ONT)
    parser = GafParser()
    assocs = parser.parse(POMBASE, skipheader=True)
    assocs = [a for a in assocs if a['subject']['label'] in GENES]
    aset = afa.create_from_assocs(assocs, ontology=ont)
    # Replace the full ontology with the subontology induced by the
    # association set, so MICA computation only covers terms in use.
    ont = aset.subontology()
    aset.ontology = ont
    logging.info('Genes={} Terms={}'.format(len(aset.subjects), len(ont.nodes())))
    print('STATS={}'.format(aset.as_dataframe().describe()))
    sse = SemSearchEngine(assocmodel=aset)
    logging.info('Calculating all MICAs')
    sse.calculate_all_micas()
    logging.info('Doing pairwise')
    for i in aset.subjects:
        for j in aset.subjects:
            sim = sse.pw_score_cosine(i, j)
            if i == j:
                # A gene compared with itself must score (near-)identical.
                assert (sim > 0.9999)
            tups = sse.pw_score_resnik_bestmatches(i, j)
            print('{} x {} = {} // {}'.format(i, j, sim, tups))
def test_default_gaf_version():
    """A GAF with no version header should be treated as version 2.1."""
    p = GafParser()
    # Use a context manager so the test file handle is not leaked.
    with open("tests/resources/test-qualifiers-no-version.gaf") as gaf:
        p.parse(gaf, skipheader=True)
    assert p.version == "2.1"
def test_qualifiers_gaf():
    """Qualifier parsing: negation count, relation mapping, and gorule-0000001 messages."""
    p = GafParser()
    # Use a context manager so the test file handle is not leaked.
    with open(QGAF, "r") as gaf:
        assocs = p.parse(gaf, skipheader=True)
    neg_assocs = [a for a in assocs if a['negated'] == True]
    assert len(neg_assocs) == 3
    for a in assocs:
        print('REL: {}'.format(a['relation']))
    assert len([a for a in assocs if a['relation']['id'] == 'contributes_to']) == 1
    # For the space in `colocalizes with`
    messages = p.report.to_report_json()["messages"]["gorule-0000001"]
    assert len([e for e in messages if e["obj"] == "colocalizes with"]) == 1
    assert len([e for e in messages if e["obj"] == "involved_in"]) == 1
def test_errors_gaf():
    """errors.gaf: expect 16 report messages, 1 invalid ID space, and repaired extensions."""
    config = assocparser.AssocParserConfig(ecomap=EcoMap())
    p = GafParser(config=config)
    # Use a context manager so the test file handle is not leaked.
    with open("tests/resources/errors.gaf", "r") as gaf:
        assocs = p.parse(gaf, skipheader=True)
    msgs = p.report.messages
    print(json.dumps(p.report.to_report_json(), indent=4))
    n_invalid_idspace = 0
    for m in msgs:
        print("MESSAGE: {}".format(m))
        if m['type'] == assocparser.Report.INVALID_IDSPACE:
            n_invalid_idspace += 1
    assert len(msgs) == 16
    assert n_invalid_idspace == 1
    assert len(assocs) == 5
    w = GafWriter()
    w.write(assocs)
    for a in assocs:
        if a['object_extensions'] != {}:
            # our test file has no ORs, so in DNF this is always the first
            xs = a['object_extensions']['union_of'][0]['intersection_of']
            for x in xs:
                print('X: {}'.format(x))
                # ensure that invalid expressions have been eliminated
                assert x['property'] == 'foo'
                assert x['filler'] == 'X:1'
            assert len(xs) == 1
def test_no_flag_valid_id():
    """A known-good GO class id must not generate any report messages."""
    ontology = OntologyFactory().create(ONT)
    parser = GafParser()
    parser.config.ontology = ontology
    fake_line = assocparser.SplitLine("fake", [""] * 17, taxon="foo")
    parser._validate_ontology_class_id("GO:0000785", fake_line)
    assert len(parser.report.messages) == 0
def retrieve_associations(self, ont, group):
    """Fetch and store gene associations for *group* against ontology *ont*.

    For ``ont == 'go'`` the group's GAF is downloaded from
    current.geneontology.org and restricted to descendants of
    biological_process / molecular_function; otherwise the association
    factory is queried by taxon for gene-phenotype associations.
    """
    taxon_map = {
        'human': 'NCBITaxon:9606',
        'mouse': 'NCBITaxon:10090',
    }
    ofactory = OntologyFactory()
    self.ontology = ofactory.create(ont)
    p = GafParser()
    url = ''
    if ont == 'go':
        # GO:0008150 is biological_process, GO:0003674 is molecular_function.
        go_roots = set(
            self.ontology.descendants('GO:0008150') +
            self.ontology.descendants('GO:0003674'))
        sub_ont = self.ontology.subontology(go_roots)
        if group == 'mouse':
            url = "http://current.geneontology.org/annotations/mgi.gaf.gz"
        if group == 'human':
            url = "http://current.geneontology.org/annotations/goa_human.gaf.gz"
        # BUG FIX: previously this parsed the hard-coded local file
        # 'goa_human.gaf.gz' for every group, ignoring the URL selected
        # above (the mouse GAF was never used). Parse the chosen URL.
        assocs = p.parse(url)
        self.assocs = assocs
        assocs = [x for x in assocs if 'header' not in x.keys()]
        assocs = [x for x in assocs if x['object']['id'] in go_roots]
        self.associations = self.afactory.create_from_assocs(
            assocs, ontology=sub_ont)
    else:
        self.associations = self.afactory.create(
            ontology=self.ontology,
            subject_category='gene',
            object_category='phenotype',
            taxon=taxon_map[group])
def produce_gaf(dataset, source_gaf, ontology_graph, gpipath=None, paint=False, group="unknown"):
    """Validate *source_gaf* and write a filtered + validated GAF pair.

    Writes ``{dataset}_noiea.gaf`` (IEA-filtered lines), ``{dataset}_valid.gaf``
    (validated associations), plus markdown and JSON report files, all in the
    directory containing *source_gaf*.

    :param dataset: basename used for all output files
    :param source_gaf: path to the input GAF
    :param ontology_graph: ontology used by the parser for validation
    :param gpipath: optional GPI authority file path
    :param paint: whether PAINT-specific parsing rules apply
    :param group: annotation group name passed to the parser
    :return: ``[validated_gaf_path, filtered_associations_path]``
    """
    # Lines filtered out (IEA evidence) are diverted into this file by the parser.
    filtered_associations = open(os.path.join(os.path.split(source_gaf)[0], "{}_noiea.gaf".format(dataset)), "w")
    config = assocparser.AssocParserConfig(
        ontology=ontology_graph,
        filter_out_evidence=["IEA"],
        filtered_evidence_file=filtered_associations,
        gpi_authority_path=gpipath,
        paint=paint
    )
    validated_gaf_path = os.path.join(os.path.split(source_gaf)[0], "{}_valid.gaf".format(dataset))
    outfile = open(validated_gaf_path, "w")
    gafwriter = GafWriter(file=outfile)
    click.echo("Validating source GAF: {}".format(source_gaf))
    parser = GafParser(config=config, group=group, dataset=dataset)
    # First pass: count lines so the progress bar has a known length.
    with open(source_gaf) as sg:
        lines = sum(1 for line in sg)
    # Second pass: stream associations through the parser and write the valid ones.
    with open(source_gaf) as gaf:
        with click.progressbar(iterable=parser.association_generator(file=gaf), length=lines) as associations:
            for assoc in associations:
                gafwriter.write_assoc(assoc)
    # NOTE(review): outfile/filtered_associations are not closed if parsing
    # raises; a try/finally or `with` would be safer — confirm before changing.
    outfile.close()
    filtered_associations.close()
    with open(os.path.join(os.path.split(source_gaf)[0], "{}.report.md".format(dataset)), "w") as report_md:
        report_md.write(parser.report.to_markdown())
    with open(os.path.join(os.path.split(source_gaf)[0], "{}.report.json".format(dataset)), "w") as report_json:
        report_json.write(json.dumps(parser.report.to_report_json(), indent=4))
    return [validated_gaf_path, filtered_associations.name]
def test_errors_gaf():
    """errors.gaf (object model): 13 messages, 1 invalid ID space, repaired extensions."""
    config = assocparser.AssocParserConfig(ecomap=EcoMap())
    p = GafParser(config=config)
    # Use a context manager so the test file handle is not leaked.
    with open("tests/resources/errors.gaf", "r") as gaf:
        assocs = p.parse(gaf, skipheader=True)
    msgs = p.report.messages
    print(json.dumps(p.report.to_report_json(), indent=4))
    n_invalid_idspace = 0
    for m in msgs:
        print("MESSAGE: {}".format(m))
        if m['type'] == assocparser.Report.INVALID_IDSPACE:
            n_invalid_idspace += 1
    assert len(msgs) == 13
    assert n_invalid_idspace == 1
    assert len(assocs) == 2
    w = GafWriter()
    w.write(assocs)
    for a in assocs:
        if a.object_extensions != []:
            # our test file has no ORs, so in DNF this is always the first
            xs = a.object_extensions[0].elements
            print(xs)
            for x in xs:
                print('X: {}'.format(x))
                # ensure that invalid expressions have been eliminated
                assert x.relation == association.Curie("BFO", "0000050")
                assert x.term == association.Curie.from_str('X:1')
            assert len(xs) == 1
def produce_gpi(dataset, target_dir, gaf_path, ontology_graph):
    """Build a GPI file from *gaf_path*, deduplicating entities.

    :param dataset: basename used for the output ``{dataset}.gpi``
    :param target_dir: unused here; kept for interface compatibility
    :param gaf_path: path to the source GAF
    :param ontology_graph: ontology used by the GAF parser
    :return: path to the written GPI file
    """
    gafparser = GafParser()
    gafparser.config = assocparser.AssocParserConfig(ontology=ontology_graph)
    # First pass: count lines so the progress bar has a known length.
    with open(gaf_path) as sg:
        lines = sum(1 for line in sg)
    gpi_path = os.path.join(
        os.path.split(gaf_path)[0], "{}.gpi".format(dataset))
    with open(gaf_path) as gf, open(gpi_path, "w") as gpi:
        click.echo("Using {} as the gaf to build gpi with".format(gaf_path))
        bridge = gafgpibridge.GafGpiBridge()
        gpiwriter = entitywriter.GpiWriter(file=gpi)
        gpi_cache = set()
        with click.progressbar(
                iterable=gafparser.association_generator(file=gf),
                length=lines) as associations:
            # Loop variable renamed from 'association' so it no longer
            # shadows the imported `association` module.
            for assoc in associations:
                entity = bridge.convert_association(assoc)
                if entity not in gpi_cache and entity is not None:
                    # If the entity is not in the cache, add it and write it out
                    gpi_cache.add(entity)
                    gpiwriter.write_entity(entity)
    return gpi_path
def load_associations(self, taxon) -> None:
    """Load gene associations for *taxon* into ``self.associations``."""
    taxon_map = {
        'human': 'NCBITaxon:9606',
        'mouse': 'NCBITaxon:10090',
    }
    ofactory = OntologyFactory()
    self.ontology = ofactory.create(self.ont)
    p = GafParser()
    url = ''
    if self.ont == 'go':
        # GO:0008150 is biological_process and GO:0003674 is
        # molecular_function -- two of the three top-level GO terms
        # (cellular_component is deliberately excluded).
        go_roots = set(self.ontology.descendants('GO:0008150') +
                       self.ontology.descendants('GO:0003674'))
        sub_ont = self.ontology.subontology(go_roots)
        if taxon == 'mouse':
            url = "http://current.geneontology.org/annotations/mgi.gaf.gz"
        if taxon == 'human':
            url = "http://current.geneontology.org/annotations/goa_human.gaf.gz"
        parsed = p.parse(url)
        self.assocs = parsed
        # Drop header records, then keep only annotations under BP/MF.
        kept = [x for x in parsed if 'header' not in x.keys()]
        kept = [x for x in kept if x['object']['id'] in go_roots]
        self.associations = self.afactory.create_from_assocs(kept, ontology=sub_ont)
    else:
        self.associations = self.afactory.create(
            ontology=self.ontology,
            subject_category='gene',
            object_category='phenotype',
            taxon=taxon_map[taxon])
def test_errors_gaf():
    """errors.gaf (dict model): 15 messages, 7 associations, repaired extensions."""
    p = GafParser()
    p.config.ecomap = EcoMap()
    # Use a context manager so the test file handle is not leaked.
    with open("tests/resources/errors.gaf", "r") as gaf:
        assocs = p.parse(gaf)
    msgs = p.report.messages
    print("MESSAGES: {}".format(len(msgs)))
    for m in msgs:
        print("MESSAGE: {}".format(m))
    assert len(msgs) == 15
    assert len(assocs) == 7
    from ontobio.io import GafWriter
    w = GafWriter()
    w.write(assocs)
    for a in assocs:
        if 'object_extensions' in a:
            # our test file has no ORs, so in DNF this is always the first
            xs = a['object_extensions']['union_of'][0]['intersection_of']
            for x in xs:
                print('X: {}'.format(x))
                # ensure that invalid expressions have been eliminated
                assert x['property'] == 'foo'
                assert x['filler'] == 'X:1'
            assert len(xs) == 1
def test_bad_date():
    """A date column of 'TODAY' is invalid: the line is skipped, nothing is produced."""
    parser = GafParser()
    result = parser.parse_line(
        "PomBase\tSPAC25B8.17\typf1\t\tGO:0000007\tGO_REF:0000024\tISO\tSGD:S000001583\tC\tintramembrane aspartyl protease of the perinuclear ER membrane Ypf1 (predicted)\tppp81\tprotein\ttaxon:4896\tTODAY\tPomBase\tfoo(X:1)"
    )
    assert result.skipped == True
    assert result.associations == []
def produce_ttl(dataset, target_dir, gaf_path, ontology_graph):
    """Translate a GAF into a CAM TTL file next to *gaf_path*.

    :param dataset: basename used for the output ``{dataset}_cam.ttl``
    :param target_dir: unused here; kept for interface compatibility
    :param gaf_path: path to the source GAF
    :param ontology_graph: ontology used by the GAF parser
    :return: path to the written TTL file
    """
    gafparser = GafParser()
    gafparser.config = assocparser.AssocParserConfig(ontology=ontology_graph)
    # First pass: count lines so the progress bar has a known length.
    with open(gaf_path) as sg:
        lines = sum(1 for line in sg)
    ttl_path = os.path.join(
        os.path.split(gaf_path)[0], "{}_cam.ttl".format(dataset))
    click.echo("Producing ttl: {}".format(ttl_path))
    rdf_writer = assoc_rdfgen.TurtleRdfWriter()
    transformer = assoc_rdfgen.CamRdfTransform(writer=rdf_writer)
    # (removed: an unused local AssocParserConfig that was never referenced)
    with open(gaf_path) as gf:
        with click.progressbar(
                iterable=gafparser.association_generator(file=gf),
                length=lines) as associations:
            for association in associations:
                if "header" not in association or not association["header"]:
                    transformer.provenance()
                    transformer.translate(association)
    with open(ttl_path, "wb") as ttl:
        click.echo("Writing ttl to disk")
        rdf_writer.serialize(destination=ttl)
    return ttl_path
def test_subject_extensions():
    """Column 17 (gene product form id) ends up in subject_extensions as 'isoform'."""
    parser = GafParser()
    result = parser.parse_line("PomBase\tSPAC25B8.17\typf1\t\tGO:0000007\tGO_REF:0000024\tISO\tSGD:S000001583\tC\tintramembrane aspartyl protease of the perinuclear ER membrane Ypf1 (predicted)\tppp81\tprotein\ttaxon:4896\t20181024\tPomBase\tfoo(X:1)\tUniProtKB:P12345")
    print(json.dumps(result.associations[0], indent=4))
    assert "subject_extensions" in result.associations[0]
    exts = result.associations[0]['subject_extensions']
    isoforms = [ext["filler"] for ext in exts if ext["property"] == "isoform"]
    assert isoforms[0] == "UniProtKB:P12345"
def test_one_line():
    """Smoke-test parse_line against a goslim-backed parser config.

    NOTE(review): the parse result is never asserted, so this only checks
    that parsing does not raise. The line's fields also appear to be
    space-separated rather than tab-separated -- confirm whether tabs were
    lost somewhere; as written it likely does not split into GAF columns.
    """
    p = GafParser(config=assocparser.AssocParserConfig(
        ontology=OntologyFactory().create(
            "tests/resources/goslim_generic.json")))
    parsed = p.parse_line(
        "PomBase SPBC16D10.09 pcn1 GO:0009536 PMID:8663159 IDA C PCNA pcn protein taxon:4896 20150326 PomBase"
    )
def test_gaf_2_1_upconvert_in_parse():
    """A 2.1 line with an empty qualifier on a CC term gets relation BFO:0000050."""
    gaf = io.StringIO("!gaf-version: 2.1\nSGD\tS000000819\tAFG3\t\tGO:0005840\tPMID:8681382|SGD_REF:S000055187\tIMP\t\tP\tMitochondrial inner membrane m-AAA protease component\tYER017C|AAA family ATPase AFG3|YTA10\tgene\ttaxon:559292\t20170428\tSGD")
    ontology = OntologyFactory().create("tests/resources/goslim_generic.json")
    parser = GafParser(config=assocparser.AssocParserConfig(ontology=ontology))
    # GAF 2.1, blank qualifier, cell component term, ontology supplied:
    # the parser should upgrade the relation during parse.
    results = parser.parse(gaf, skipheader=True)
    assert results[0].relation == association.Curie(namespace="BFO", identity="0000050")
def test_subject_extensions():
    """Column 17 (gene product form) becomes a single subject-extension Curie."""
    parser = GafParser()
    result = parser.parse_line("PomBase\tSPAC25B8.17\typf1\t\tGO:0000007\tGO_REF:0000024\tISO\tSGD:S000001583\tC\tintramembrane aspartyl protease of the perinuclear ER membrane Ypf1 (predicted)\tppp81\tprotein\ttaxon:4896\t20181024\tPomBase\tpart_of(X:1)\tUniProtKB:P12345")
    exts = result.associations[0].subject_extensions
    assert len(exts) == 1
    assert exts[0].term == association.Curie.from_str("UniProtKB:P12345")
def test_errors_gaf():
    """errors.gaf should yield exactly 8 report messages with this parser setup."""
    p = GafParser()
    p.config.ecomap = EcoMap()
    # Use a context manager so the test file handle is not leaked.
    with open("tests/resources/errors.gaf", "r") as gaf:
        assocs = p.parse(gaf)
    msgs = p.report.messages
    print("MESSAGES: {}".format(len(msgs)))
    for m in msgs:
        print("MESSAGE: {}".format(m))
    assert len(msgs) == 8
def test_bad_withfrom():
    """A with/from value whose identity part is empty ('SGD:') must be rejected."""
    parser = GafParser()
    result = parser.parse_line(
        "PomBase\tSPAC25B8.17\typf1\t\tGO:0000007\tGO_REF:0000024\tISO\tSGD:\tC\tintramembrane aspartyl protease of the perinuclear ER membrane Ypf1 (predicted)\tppp81\tprotein\ttaxon:4896\t20181024\tPomBase"
    )
    assert result.associations == []
    report = parser.report.to_report_json()
    assert report["messages"]["gorule-0000001"][0]["obj"] == "SGD:"
def load_associations_from_file(self, associations_type: DataType, associations_url: str,
                                associations_cache_path: str, config: GenedescConfigParser) -> None:
    """load go associations from file

    Args:
        associations_type (DataType): the type of associations to set
        associations_url (str): url to the association file
        associations_cache_path (str): path to cache file for the associations
        config (GenedescConfigParser): configuration object where to read properties
    """
    assoc_config = AssocParserConfig(remove_double_prefixes=True, paint=True)

    def _parse_assocs(ontology):
        # Shared parse/create step, previously triplicated across the
        # GO / DO / EXPR branches below.
        return AssociationSetFactory().create_from_assocs(
            assocs=GafParser(config=assoc_config).parse(
                file=self._get_cached_file(
                    cache_path=associations_cache_path,
                    file_source_url=associations_url),
                skipheader=True),
            ontology=ontology)

    if associations_type == DataType.GO:
        logger.info("Loading GO associations from file")
        self.go_associations = _parse_assocs(self.go_ontology)
        self.go_associations = self.remove_blacklisted_annotations(
            association_set=self.go_associations,
            ontology=self.go_ontology,
            terms_blacklist=config.get_module_property(
                module=Module.GO,
                prop=ConfigModuleProperty.EXCLUDE_TERMS))
    elif associations_type == DataType.DO:
        logger.info("Loading DO associations from file")
        self.do_associations = _parse_assocs(self.do_ontology)
        self.do_associations = self.remove_blacklisted_annotations(
            association_set=self.do_associations,
            ontology=self.do_ontology,
            terms_blacklist=config.get_module_property(
                module=Module.DO_EXP_AND_BIO,
                prop=ConfigModuleProperty.EXCLUDE_TERMS))
    elif associations_type == DataType.EXPR:
        logger.info("Loading Expression associations from file")
        self.expression_associations = _parse_assocs(self.expression_ontology)
        self.expression_associations = self.remove_blacklisted_annotations(
            association_set=self.expression_associations,
            ontology=self.expression_ontology,
            terms_blacklist=config.get_module_property(
                module=Module.EXPRESSION,
                prop=ConfigModuleProperty.EXCLUDE_TERMS))
def test_skim_gaf():
    """skim() should yield 370 (subject, name, object) tuples with proper prefixes."""
    p = GafParser()
    p.config.ecomap = EcoMap()
    # Use a context manager so the test file handle is not leaked.
    with open(POMBASE, "r") as gaf:
        results = p.skim(gaf)
    assert len(results) == 370
    for r in results:
        print(str(r))
        (s, sn, o) = r
        assert o.startswith('GO:')
        assert s.startswith('PomBase:')
def test_alt_id_repair():
    """An id present as a primary term in the alt-id ontology passes through unchanged."""
    parser = GafParser()
    parser.config.ecomap = EcoMap()
    parser.config.ontology = OntologyFactory().create(ALT_ID_ONT)
    gaf = io.StringIO("SGD\tS000000819\tAFG3\t\tGO:0043623\tPMID:8681382|SGD_REF:S000055187\tIMP\t\tP\tMitochondrial inner membrane m-AAA protease component\tYER017C|AAA family ATPase AFG3|YTA10\tgene\ttaxon:559292\t20170428\tSGD")
    assocs = parser.parse(gaf, skipheader=True)
    assert len(assocs) > 0
    assert assocs[0]["object"]["id"] == "GO:0043623"
def create_from_remote_file(self, group, snapshot=True, **args):
    """
    Creates from remote GAF.

    :param group: annotation group name, e.g. 'mgi'
    :param snapshot: if True (default) fetch from snapshot.geneontology.org,
        otherwise from current.geneontology.org. Previously this flag was
        accepted but silently ignored.
    """
    import requests
    host = "snapshot" if snapshot else "current"
    url = "http://{}.geneontology.org/annotations/{}.gaf.gz".format(host, group)
    r = requests.get(url, stream=True)
    p = GafParser()
    results = p.skim(r.raw)
    return self.create_from_tuples(results, **args)
def create_from_remote_file(self, group, snapshot=True, **args):
    """
    Creates from remote GAF.

    :param group: annotation group name, e.g. 'mgi'
    :param snapshot: if True (default) fetch from snapshot.geneontology.org,
        otherwise from current.geneontology.org. Previously this flag was
        accepted but silently ignored.
    """
    import requests
    host = "snapshot" if snapshot else "current"
    url = "http://{}.geneontology.org/annotations/{}.gaf.gz".format(host, group)
    r = requests.get(url, stream=True, headers={'User-Agent': get_user_agent(modules=[requests], caller_name=__name__)})
    p = GafParser()
    results = p.skim(r.raw)
    return self.create_from_tuples(results, **args)
def make_products(dataset, target_dir, gaf_path, products, ontology_graph):
    """Produce the requested data products (GPAD and/or TTL) from a GAF.

    :param dataset: basename for output files
    :param target_dir: unused here; kept for interface compatibility
    :param gaf_path: path to the source GAF
    :param products: dict with boolean flags for "gpad" and "ttl"
    :param ontology_graph: ontology used by the GAF parser
    :return: list of paths of the products that were written
    """
    if not products["gpad"] and not products["ttl"]:
        # Nothing requested: bail before creating any output files.
        # (Previously this check ran *after* the files were opened, leaking
        # the handles and leaving empty product files on disk.)
        return []
    gafparser = GafParser()
    gafparser.config = assocparser.AssocParserConfig(
        ontology=ontology_graph,
        paint=True
    )
    # First pass: count lines so the progress bar has a known length.
    with open(gaf_path) as sg:
        lines = sum(1 for line in sg)
    product_files = {
        "gpad": open(os.path.join(os.path.split(gaf_path)[0], "{}.gpad".format(dataset)), "w"),
        "ttl": open(os.path.join(os.path.split(gaf_path)[0], "{}_cam.ttl".format(dataset)), "wb")
    }
    with open(gaf_path) as gf:
        click.echo("Using {} as the gaf to build data products with".format(gaf_path))
        if products["ttl"]:
            click.echo("Setting up {}".format(product_files["ttl"].name))
            rdf_writer = assoc_rdfgen.TurtleRdfWriter(label=os.path.split(product_files["ttl"].name)[1])
            transformer = assoc_rdfgen.CamRdfTransform(writer=rdf_writer)
        if products["gpad"]:
            click.echo("Setting up {}".format(product_files["gpad"].name))
            gpadwriter = GpadWriter(file=product_files["gpad"])
        click.echo("Making products...")
        with click.progressbar(iterable=gafparser.association_generator(file=gf), length=lines) as associations:
            # Loop variable renamed from 'association' so it no longer
            # shadows the imported `association` module.
            for assoc in associations:
                if products["ttl"]:
                    if "header" not in assoc or not assoc["header"]:
                        transformer.provenance()
                        transformer.translate(assoc)
                if products["gpad"]:
                    gpadwriter.write_assoc(assoc)
        # post ttl steps
        if products["ttl"]:
            click.echo("Writing ttl to disk")
            rdf_writer.serialize(destination=product_files["ttl"])
    # After we run through associations
    for f in product_files.values():
        f.close()
    return [product_files[prod].name for prod in sorted(product_files.keys()) if products[prod]]
def test_qualifiers_gaf_2_2():
    """GAF 2.2 qualifier rules: bad qualifiers flagged, valid ones parsed to Curies."""
    p = GafParser()
    # Use a context manager so the test file handle is not leaked.
    with open("tests/resources/test-qualifiers-2.2.gaf") as gaf:
        assocs = p.parse(gaf, skipheader=True)
    messages = p.report.to_report_json()["messages"]["gorule-0000001"]
    # NOT by itself is not allowed
    assert len([e for e in messages if e["obj"] == "NOT"]) == 1
    assert len([e for e in messages if e["obj"] == "contributes_to|enables"]) == 1
    assert len([a for a in assocs if association.Curie.from_str("RO:0004035") in a.qualifiers]) == 1
def test_upgrade_qualifiers_for_cell_component():
    """An empty qualifier on a cell-component annotation is upgraded to RO:0002432."""
    row = ["SGD", "S000000819", "AFG3", "", "GO:0008372",
           "PMID:8681382|SGD_REF:S000055187", "IMP", "", "P",
           "Mitochondrial inner membrane m-AAA protease component",
           "YER017C|AAA family ATPase AFG3|YTA10", "gene", "taxon:559292",
           "20170428", "SGD"]
    ontology = OntologyFactory().create("tests/resources/goslim_generic.json")
    parser = GafParser(config=assocparser.AssocParserConfig(ontology=ontology))
    parser.make_internal_cell_component_closure()
    parsed = gafparser.to_association(row)
    upgraded = parser.upgrade_empty_qualifier(parsed.associations[0])
    assert upgraded.qualifiers[0] == association.Curie(namespace="RO", identity="0002432")
def test_object_extensions():
    """part_of(X:1) in column 16 parses into a single ConjunctiveSet extension."""
    parser = GafParser()
    result = parser.parse_line("PomBase\tSPAC25B8.17\typf1\t\tGO:0000007\tGO_REF:0000024\tISO\tSGD:S000001583\tC\tintramembrane aspartyl protease of the perinuclear ER membrane Ypf1 (predicted)\tppp81\tprotein\ttaxon:4896\t20181024\tPomBase\tpart_of(X:1)\tUniProtKB:P12345")
    print(parser.report.to_markdown())
    assert len(result.associations[0].object_extensions) > 0
    expected = [
        association.ConjunctiveSet([
            association.ExtensionUnit(association.Curie("BFO", "0000050"),
                                      association.Curie("X", "1"))
        ])
    ]
    assert result.associations[0].object_extensions == expected
def load_associations(self, group):
    """Download the GAF for *group* and load it into ``self.associations``."""
    parser = GafParser()
    afactory = AssociationSetFactory()
    if group == 'human':
        url = "http://geneontology.org/gene-associations/goa_human.gaf.gz"
    else:
        url = "http://geneontology.org/gene-associations/gene_association.{}.gz".format(group)
    # Drop header records before building the association set.
    assocs = [x for x in parser.parse(url) if 'header' not in x.keys()]
    self.associations = afactory.create_from_assocs(assocs, ontology=self.ontology)
def test_gaf_2_1_creates_cell_component_closure():
    """Seeing a gaf-version 2.1 header should compute the protein-complex closure."""
    ontology = OntologyFactory().create("tests/resources/goslim_generic.json")
    # GO:0005840 is used as a representative term expected in the closure.
    assert "GO:0005840" in gafparser.protein_complex_sublcass_closure(ontology)
    parser = GafParser(config=assocparser.AssocParserConfig(ontology=ontology))
    with open("tests/resources/pombase_single.gaf") as gaf:
        # The first line is the version declaration, which triggers the
        # closure computation inside the parser.
        parser.parse_line(gaf.readline())
    assert "GO:0005840" in parser.cell_component_descendants_closure
def parse(self, limit=None):
    """
    Override Source.parse()

    Args:
        :param limit (int, optional) limit the number of rows processed
    Returns:
        :return None
    """
    if limit is not None:
        LOG.info("Only parsing first %d rows", limit)
    rgd_file = '/'.join(
        (self.rawdir, self.files['rat_gene2mammalian_phenotype']['file']))
    # ontobio gafparser implemented here; parse within a context manager so
    # the file handle is not leaked.
    p = GafParser()
    with open(rgd_file, "r") as gaf:
        assocs = p.parse(gaf)
    for i, assoc in enumerate(assocs):
        if 'relation' in assoc.keys():
            self.make_association(assoc)
        # NOTE(review): `i > limit` stops after limit+2 rows rather than
        # `limit`; preserved as-is to avoid changing existing behavior.
        if limit is not None and i > limit:
            break
    return