def test_gaf_to_gpad2():
    """Write parsed GAF lines as GPAD 2.0 and compare header and data row.

    Two scenarios are checked: a line with an empty qualifier column and the
    same line with a ``NOT`` qualifier.
    """
    scenarios = [
        (
            "PomBase\tSPAC25B8.17\typf1\t\tGO:0000006\tGO_REF:0000024\tISO\tSGD:S000001583\tC\tintramembrane aspartyl protease of the perinuclear ER membrane Ypf1 (predicted)\tppp81\tprotein\ttaxon:999|taxon:888\t20150305\tPomBase\tpart_of(X:1)\tUniProtKB:P12345",
            "PomBase:SPAC25B8.17\t\tBFO:0000050\tGO:0000006\tGO_REF:0000024\tECO:0000266\tSGD:S000001583\tNCBITaxon:888\t2015-03-05\tPomBase\tBFO:0000050(X:1)\t",
        ),
        (
            "PomBase\tSPAC25B8.17\typf1\tNOT\tGO:0000006\tGO_REF:0000024\tISO\tSGD:S000001583\tC\tintramembrane aspartyl protease of the perinuclear ER membrane Ypf1 (predicted)\tppp81\tprotein\ttaxon:999|taxon:888\t20150305\tPomBase\tpart_of(X:1)\tUniProtKB:P12345",
            "PomBase:SPAC25B8.17\tNOT\tBFO:0000050\tGO:0000006\tGO_REF:0000024\tECO:0000266\tSGD:S000001583\tNCBITaxon:888\t2015-03-05\tPomBase\tBFO:0000050(X:1)\t",
        ),
    ]

    for gaf_line, expected_row in scenarios:
        # Fresh parser, buffer and writer per scenario so nothing carries over.
        parser = gafparser.GafParser()
        buffer = io.StringIO()
        gpad_writer = assocwriter.GpadWriter(
            version=assocwriter.GPAD_2_0, file=buffer)

        parsed = parser.parse_line(gaf_line)
        gpad_writer.write_assoc(parsed.associations[0])

        written = buffer.getvalue().split("\n")
        assert written[0] == "!gpa-version: 2.0"
        assert written[1] == expected_row
def test_roundtrip():
    """
    Parse a GAF line, write it back out, and expect the written line to be
    identical to the input. Run once with two taxa and once with a single one.
    """
    inputs = [
        # Two taxa
        "PomBase\tSPAC25B8.17\typf1\t\tGO:0000006\tGO_REF:0000024\tISO\tSGD:S000001583\tC\tintramembrane aspartyl protease of the perinuclear ER membrane Ypf1 (predicted)\tppp81\tprotein\ttaxon:999|taxon:888\t20150305\tPomBase\tfoo(X:1)\tUniProtKB:P12345",
        # Single taxon
        "PomBase\tSPAC25B8.17\typf1\t\tGO:0000006\tGO_REF:0000024\tISO\tSGD:S000001583\tC\tintramembrane aspartyl protease of the perinuclear ER membrane Ypf1 (predicted)\tppp81\tprotein\ttaxon:1111\t20150305\tPomBase\tfoo(X:1)\tUniProtKB:P12345",
    ]

    for original in inputs:
        parser = gafparser.GafParser()
        buffer = io.StringIO()
        gaf_writer = assocwriter.GafWriter(file=buffer)

        first_assoc = parser.parse_line(original).associations[0]
        gaf_writer.write_assoc(first_assoc)

        # Skip "!" header lines; the first remaining row is the data line.
        data_rows = [
            row for row in buffer.getvalue().split("\n")
            if not row.startswith("!")
        ]
        assert original == data_rows[0]
def test_gaf2_2_qualifier_to_gaf2_1():
    """Parse GAF 2.2 lines and write them with a GAF 2.1 writer.

    The ``involved_in`` qualifier is expected to come out as an empty
    qualifier column (the line itself must still be written), and
    ``NOT|involved_in`` is expected to come out as just ``NOT``.
    """
    scenarios = [
        (
            "WB\tWBGene00000001\taap-1\tinvolved_in\tGO:0008286\tWB_REF:WBPaper00005614|PMID:12393910\tIMP\t\tP\t\tY110A7A.10\tgene\ttaxon:6239\t20060302\tWB\t\t",
            "",
        ),
        # Test with a `NOT`
        (
            "WB\tWBGene00000001\taap-1\tNOT|involved_in\tGO:0008286\tWB_REF:WBPaper00005614|PMID:12393910\tIMP\t\tP\t\tY110A7A.10\tgene\ttaxon:6239\t20060302\tWB\t\t",
            "NOT",
        ),
    ]

    for gaf_22_line, expected_qualifier in scenarios:
        parser = gafparser.GafParser()
        parser.version = "2.2"
        buffer = io.StringIO()
        # Write out to gaf 2.1
        gaf_21_writer = assocwriter.GafWriter(file=buffer, version="2.1")

        first_assoc = parser.parse_line(gaf_22_line).associations[0]
        gaf_21_writer.write_assoc(first_assoc)

        data_rows = [
            row for row in buffer.getvalue().split("\n")
            if not row.startswith("!")
        ]
        columns = data_rows[0].split("\t")
        assert columns[3] == expected_qualifier
def test_no_colon_in_id():
    """An ID without a colon should be rejected with a single ERROR message."""
    parser = gafparser.GafParser()

    is_valid = parser._validate_id("FOOBAR", "")

    assert not is_valid
    messages = parser.report.messages
    assert len(messages) == 1
    assert messages[0]["level"] == assocparser.Report.ERROR
def test_validate_pipe_separated():
    """Pipe-separated IDs should come back as the individual IDs."""
    parser = gafparser.GafParser()

    single = parser.validate_pipe_separated_ids("PMID:12345", "")
    assert set(single) == {"PMID:12345"}

    pair = parser.validate_pipe_separated_ids("PMID:12345|PMID:11111", "")
    assert set(pair) == {"PMID:12345", "PMID:11111"}
def test_empty_pre_colon():
    """An ID with nothing before the colon should be rejected with one ERROR."""
    parser = gafparser.GafParser()

    is_valid = parser._validate_id(":123", "")

    assert not is_valid
    messages = parser.report.messages
    assert len(messages) == 1
    assert messages[0]["level"] == assocparser.Report.ERROR
def create_parser_from_header(
        line: str,
        config: assocparser.AssocParserConfig,
        group="unknown",
        dataset="unknown",
        bio_entities=None) -> Optional[assocparser.AssocParser]:
    """Build a parser from a version header line, if one can be recognised.

    ``parser_version_regex`` is applied to ``line``. A parser is created only
    when it yields exactly one match, whose first two fields are taken as the
    file type and the version:

    * ``gpad`` / ``gpa`` gives a ``GpadParser``; its ``version`` is set when
      the header version is ``1.2`` or ``2.0``.
    * ``gaf`` gives a ``GafParser``; its ``version`` is set when the header
      version is ``2.1`` or ``2.2``.

    For any other version the parser is returned with whatever version it was
    constructed with. Returns ``None`` when the header does not match exactly
    once or the file type is not one of the above.
    """
    matches = parser_version_regex.findall(line)
    if len(matches) != 1:
        return None

    filetype, version, _ = matches[0]

    if filetype in ["gpad", "gpa"]:
        parser_class = gpadparser.GpadParser
        settable_versions = ["1.2", "2.0"]
    elif filetype == "gaf":
        parser_class = gafparser.GafParser
        settable_versions = ["2.1", "2.2"]
    else:
        return None

    parser = parser_class(
        config=config, bio_entities=bio_entities, group=group, dataset=dataset)
    # Only override the parser's version for versions listed for this format.
    if version in settable_versions:
        parser.version = version
    return parser
def test_pipe_in_id():
    """An ID containing a pipe should be accepted but produce one WARNING."""
    parser = gafparser.GafParser()

    is_valid = parser._validate_id("F|OO:123", "")

    assert is_valid
    messages = parser.report.messages
    assert len(messages) == 1
    assert messages[0]["level"] == assocparser.Report.WARNING
def test_bad_character_in_id():
    """An ID containing ``&`` should be rejected with a single ERROR message."""
    parser = gafparser.GafParser()

    is_valid = parser._validate_id("FOO:1&23", "")

    assert not is_valid
    messages = parser.report.messages
    assert len(messages) == 1
    assert messages[0]["level"] == assocparser.Report.ERROR
def test_validate_with_allowed_ids():
    """An ID whose prefix is listed in ``allowed_ids`` should validate."""
    parser = gafparser.GafParser()
    split_line = assocparser.SplitLine("", [""] * 17, "taxon:foo")

    is_valid = parser._validate_id("FOO:123", split_line, allowed_ids=["FOO"])

    assert is_valid
def test_empty_id():
    """An empty ID should be rejected with a single ERROR message."""
    parser = gafparser.GafParser()
    split_line = assocparser.SplitLine("", [""] * 17, "taxon:foo")

    is_valid = parser._validate_id("", split_line)

    assert not is_valid
    messages = parser.report.messages
    assert len(messages) == 1
    assert messages[0]["level"] == assocparser.Report.ERROR
def test_pipe_in_id():
    """An ID containing a pipe should be accepted but produce one WARNING.

    Here the line context is passed as a ``SplitLine``.
    """
    parser = gafparser.GafParser()
    split_line = assocparser.SplitLine("", [""] * 17, "taxon:foo")

    is_valid = parser._validate_id("F|OO:123", split_line)

    assert is_valid
    messages = parser.report.messages
    assert len(messages) == 1
    assert messages[0]["level"] == assocparser.Report.WARNING
def test_normalize_refs_good_and_bad_refs():
    """Normalizing ``FB:123`` plus ``PMID:234`` should report one message.

    Only the report is checked: exactly one message, of type
    ``INVALID_IDSPACE``. The returned refs are not asserted on.
    """
    parser = gafparser.GafParser()
    split_line = assocparser.SplitLine("", [""] * 17, "taxon:foo")

    parser.normalize_refs(["FB:123", "PMID:234"], split_line)

    messages = parser.report.messages
    assert len(messages) == 1
    assert messages[0]["type"] == assocparser.Report.INVALID_IDSPACE
def test_validate_pipe_separated_empty_allowed():
    """With ``empty_allowed=True`` an empty column should yield an empty list."""
    parser = gafparser.GafParser()
    split_line = assocparser.SplitLine("", [""] * 17, "taxon:foo")

    found = parser.validate_pipe_separated_ids(
        "", split_line, empty_allowed=True)

    assert found == []
def test_gaf_2_2_extensions():
    """The extension column should survive a GAF 2.2 parse / TSV conversion."""
    gaf_line = "WB\tWBGene00000001\taap-1\tinvolved_in\tGO:0008286\tWB_REF:WBPaper00005614|PMID:12393910\tIMP\t\tP\t\tY110A7A.10\tgene\ttaxon:6239\t20060302\tWB\tpart_of(EMAPA:17972),part_of(CL:0000018)\t"

    parser = gafparser.GafParser()
    parser.version = "2.2"
    first_assoc = parser.parse_line(gaf_line).associations[0]

    columns = first_assoc.to_gaf_2_2_tsv()

    # Index 15 is where this test expects the annotation extensions.
    assert columns[15] == "part_of(EMAPA:17972),part_of(CL:0000018)"
def test_validate_with_disallowed_id():
    """An ID whose prefix is not in ``allowed_ids`` should report one WARNING.

    The return value of ``_validate_id`` is not asserted on here; only the
    report is checked.
    """
    parser = gafparser.GafParser()
    split_line = assocparser.SplitLine("", [""] * 17, "taxon:foo")

    parser._validate_id("FOO:123", split_line, allowed_ids=["BAR"])

    messages = parser.report.messages
    assert len(messages) == 1
    assert messages[0]["level"] == assocparser.Report.WARNING
def test_validate_pipe_separated_with_bad_ids():
    """A pipe-separated list containing a malformed ID should yield ``None``.

    Checked for a bad ID alongside a good one, and for a bad ID on its own.
    """
    parser = gafparser.GafParser()

    # Identity check: the result must be None itself, not merely something
    # that compares equal to None.
    ids = parser.validate_pipe_separated_ids("PMID:123[2]|PMID:11111", "")
    assert ids is None

    ids = parser.validate_pipe_separated_ids("PMID:123[2]", "")
    assert ids is None
def test_writing_to_gaf_2_2():
    """The qualifier column should be kept verbatim in GAF 2.2 TSV output.

    Checked for a plain ``involved_in`` qualifier and for ``NOT|involved_in``.
    """
    scenarios = [
        (
            "WB\tWBGene00000001\taap-1\tinvolved_in\tGO:0008286\tWB_REF:WBPaper00005614|PMID:12393910\tIMP\t\tP\t\tY110A7A.10\tgene\ttaxon:6239\t20060302\tWB\t\t",
            "involved_in",
        ),
        # With NOT
        (
            "WB\tWBGene00000001\taap-1\tNOT|involved_in\tGO:0008286\tWB_REF:WBPaper00005614|PMID:12393910\tIMP\t\tP\t\tY110A7A.10\tgene\ttaxon:6239\t20060302\tWB\t\t",
            "NOT|involved_in",
        ),
    ]

    for gaf_line, expected_qualifier in scenarios:
        parser = gafparser.GafParser()
        parser.version = "2.2"
        first_assoc = parser.parse_line(gaf_line).associations[0]  # type: GoAssociation

        columns = first_assoc.to_gaf_2_2_tsv()
        assert columns[3] == expected_qualifier
def test_normalize_refs_single_bad_ref():
    """A lone ``FB:123`` ref should be returned as-is and reported once.

    The single report message is expected to have type ``INVALID_IDSPACE``.
    """
    parser = gafparser.GafParser()
    split_line = assocparser.SplitLine("", [""] * 17, "taxon:foo")

    normalized = parser.normalize_refs(["FB:123"], split_line)

    assert normalized == ["FB:123"]
    messages = parser.report.messages
    assert len(messages) == 1
    assert messages[0]["type"] == assocparser.Report.INVALID_IDSPACE
def test_validate_pipe_separated():
    """Pipe-separated IDs should come back as the individual IDs.

    Here the line context is passed as a ``SplitLine``.
    """
    parser = gafparser.GafParser()

    # A fresh SplitLine is built for each call, as in the two separate checks.
    single = parser.validate_pipe_separated_ids(
        "PMID:12345",
        assocparser.SplitLine("", [""] * 17, "taxon:foo"))
    assert set(single) == {"PMID:12345"}

    pair = parser.validate_pipe_separated_ids(
        "PMID:12345|PMID:11111",
        assocparser.SplitLine("", [""] * 17, "taxon:foo"))
    assert set(pair) == {"PMID:12345", "PMID:11111"}
def create_base_parser(format: FormatType) -> Optional[assocparser.AssocParser]:
    """
    Build a parser for the given format using a default AssocParserConfig.

    Only FormatType.GAF produces a parser; every other format returns None.
    """
    if format == FormatType.GAF:
        return gafparser.GafParser(config=assocparser.AssocParserConfig())
    return None
def test_validate_pipe_separated_with_bad_ids():
    """A pipe-separated list containing a malformed ID should yield ``None``.

    Checked for a bad ID alongside a good one, and for a bad ID on its own.
    The line context is passed as a ``SplitLine``.
    """
    parser = gafparser.GafParser()

    # Identity check: the result must be None itself, not merely something
    # that compares equal to None.
    ids = parser.validate_pipe_separated_ids(
        "PMID:123[2]|PMID:11111",
        assocparser.SplitLine("", [""] * 17, "taxon:foo"))
    assert ids is None

    ids = parser.validate_pipe_separated_ids(
        "PMID:123[2]",
        assocparser.SplitLine("", [""] * 17, "taxon:foo"))
    assert ids is None
def test_validate_pipe_with_additional_delims():
    """Extra delimiters should split IDs in addition to the pipe.

    First checks ``validate_pipe_separated_ids`` directly with a comma as the
    extra delimiter, then checks that a parsed line whose with/from column
    mixes pipes and commas yields all three IDs under
    ``["evidence"]["with_support_from"]``.
    """
    parser = gafparser.GafParser()

    split_ids = parser.validate_pipe_separated_ids(
        "F:123,B:234|B:111", "", extra_delims=",")
    assert set(split_ids) == {"F:123", "B:234", "B:111"}

    gaf_line = "PomBase\tSPAC25B8.17\typf1\t\tGO:1990578\tGO_REF:0000024\tISO\tUniProtKB:Q9CXD9|ensembl:ENSMUSP00000038569,PMID:11111\tC\tintramembrane aspartyl protease of the perinuclear ER membrane Ypf1 (predicted)\tppp81\tprotein\ttaxon:4896\t20150305\tPomBase\t\t"
    parsed = parser.parse_line(gaf_line)

    with_support_from = parsed.associations[0]["evidence"]["with_support_from"]
    assert set(with_support_from) == {
        "UniProtKB:Q9CXD9",
        "ensembl:ENSMUSP00000038569",
        "PMID:11111",
    }
def test_full_gaf_2_2_write():
    """A GAF 2.2 line written by a 2.2 writer should match column for column."""
    original = "WB\tWBGene00000001\taap-1\tinvolved_in\tGO:0008286\tWB_REF:WBPaper00005614|PMID:12393910\tIMP\t\tP\t\tY110A7A.10\tgene\ttaxon:6239\t20060302\tWB\t\t"

    parser = gafparser.GafParser()
    parser.version = "2.2"
    buffer = io.StringIO()
    gaf_writer = assocwriter.GafWriter(file=buffer, version="2.2")

    first_assoc = parser.parse_line(original).associations[0]
    gaf_writer.write_assoc(first_assoc)

    # Skip "!" header lines; the first remaining row is the data line.
    data_rows = [
        row for row in buffer.getvalue().split("\n")
        if not row.startswith("!")
    ]
    assert data_rows[0].split("\t") == original.split("\t")
def test_gpad_iba_writing():
    """Write the associations of a single-IBA GAF fixture as GPAD.

    The parser is configured with ``paint=True`` and the file header is
    skipped. The complete writer output (version header, one data row and the
    empty string after the final newline) is compared to the expected lines.
    """
    out = io.StringIO()
    parser = gafparser.GafParser()
    parser.config = assocparser.AssocParserConfig(paint=True)
    writer = assocwriter.GpadWriter(file=out)

    # Context manager so the fixture's file handle is closed even when
    # parsing or writing raises, instead of being left to the garbage
    # collector.
    with open("tests/resources/wb_single_iba.gaf") as gaf_file:
        for assoc in parser.association_generator(
                skipheader=True, file=gaf_file):
            writer.write_assoc(assoc)

    outlines = out.getvalue().split("\n")
    expected_lines = [
        "!gpa-version: 1.1",
        "WB\tWBGene00022144\tpart_of\tGO:0005886\tPMID:21873635\tECO:0000318\tPANTHER:PTN000073732|RGD:3252\t\t20180308\tGO_Central\t\t",
        "",
    ]
    assert expected_lines == outlines
def test_negated_qualifers():
    """A ``NOT`` qualifier should be kept by both the GAF and GPAD writers.

    The GAF row is expected to carry ``NOT`` at index 3, and the GPAD row
    ``NOT|involved_in`` at index 2.
    """
    fields = [
        "PomBase",
        "SPBC11B10.09",
        "cdc2",
        "NOT",
        "GO:0007275",
        "PMID:21873635",
        "ISO",
        "PANTHER:PTN000623979|TAIR:locus:2099478",
        "P",
        "Cyclin-dependent kinase 1",
        "UniProtKB:P04551|PTN000624043",
        "protein",
        "taxon:284812",
        "20170228",
        "GO_Central",
        "",
        "",
    ]
    gaf_line = "\t".join(fields)

    parser = gafparser.GafParser()
    parse_result = parser.parse_line(gaf_line)

    gaf_writer = assocwriter.GafWriter()
    gaf_row = gaf_writer.as_tsv(parse_result.associations[0])
    print(gaf_row)
    assert gaf_row[3] == "NOT"

    gpad_writer = assocwriter.GpadWriter()
    gpad_row = gpad_writer.as_tsv(parse_result.associations[0])
    print(gpad_row)
    assert gpad_row[2] == "NOT|involved_in"
def test_validate_pipe_with_additional_delims():
    """Extra delimiters should split IDs in addition to the pipe.

    First checks ``validate_pipe_separated_ids`` directly with a comma as the
    extra delimiter. Then parses a line whose with/from column mixes pipes and
    commas and expects ``evidence.with_support_from`` to hold two
    ``ConjunctiveSet`` objects: one per pipe-separated group, with the
    comma-separated IDs together in the second set.
    """
    parser = gafparser.GafParser()
    split_line = assocparser.SplitLine("", [""] * 17, "taxon:foo")

    split_ids = parser.validate_pipe_separated_ids(
        "F:123,B:234|B:111", split_line, extra_delims=",")
    assert set(split_ids) == {"F:123", "B:234", "B:111"}

    gaf_line = "PomBase\tSPAC25B8.17\typf1\t\tGO:1990578\tGO_REF:0000024\tISO\tUniProtKB:Q9CXD9|ensembl:ENSMUSP00000038569,PMID:11111\tC\tintramembrane aspartyl protease of the perinuclear ER membrane Ypf1 (predicted)\tppp81\tprotein\ttaxon:4896\t20150305\tPomBase\t\t"
    parsed = parser.parse_line(gaf_line)

    first_group = association.ConjunctiveSet(
        elements=[association.Curie.from_str("UniProtKB:Q9CXD9")])
    second_group = association.ConjunctiveSet(
        elements=[
            association.Curie.from_str("ensembl:ENSMUSP00000038569"),
            association.Curie.from_str("PMID:11111"),
        ])
    expected = [first_group, second_group]

    assert parsed.associations[0].evidence.with_support_from == expected
def test_normalize_refs_good():
    """A single ``PMID`` ref should be returned unchanged."""
    parser = gafparser.GafParser()
    split_line = assocparser.SplitLine("", [""] * 17, "taxon:foo")

    normalized = parser.normalize_refs(["PMID:123"], split_line)

    assert normalized == ["PMID:123"]
def _association_parser(association_type, config):
    """Return a GafParser built with ``config`` when the type is ``"gaf"``.

    Any other ``association_type`` yields ``None``.
    """
    if association_type == "gaf":
        return gafparser.GafParser(config=config)
    return None
def test_doi_id():
    """A DOI identifier with a dot and a slash in its local part should validate."""
    parser = gafparser.GafParser()
    split_line = assocparser.SplitLine("", [""] * 17, "taxon:foo")

    is_valid = parser._validate_id("DOI:10.1007/BF00127499", split_line)

    assert is_valid