def test_metadata_mismatch():
    """Column-count mismatches between metadata and CSV raise descriptive errors."""
    csv_path = "tests/negative.metadata_mismatch.csv"
    # (metadata file, expected metadata-column-count fragment); the CSV has 3 columns.
    mismatch_cases = [
        ("tests/negative.NumberOfNonVirtualColumnsMismatch1.csv-metadata.json",
         "metadata, 2"),
        ("tests/negative.NumberOfNonVirtualColumnsMismatch2.csv-metadata.json",
         "metadata, 4"),
    ]
    for meta_path, expected_fragment in mismatch_cases:
        converter = CSVW(csv_path=csv_path, metadata_path=meta_path)
        with pytest.raises(NumberOfNonVirtualColumnsMismatch) as err:
            print(converter.to_rdf())
        assert expected_fragment in str(err.value)
        assert "row 1, 3" in str(err.value)

    # A virtual column listed before a non-virtual one is rejected at construction.
    with pytest.raises(VirtualColumnPrecedesNonVirtualColumn) as err:
        CSVW(csv_path=csv_path,
             metadata_path='tests/negative.VirtualColumnPrecedesNonVirtualColumn.csv-metadata.json')
    assert "t2" in str(err.value)
def test_negative():
    """Each malformed valueUrl metadata variant raises its dedicated exception."""
    from pycsvw.csvw_exceptions import BothValueAndLiteralError, \
        BothValueAndDatatypeError, NoValueOrLiteralError, InvalidItemError

    # (metadata file, exception expected while serializing) in original order.
    failing_cases = [
        ("tests/value_urls.BothValueAndLiteralError.csv-metadata.json",
         BothValueAndLiteralError),
        ("tests/value_urls.BothValueAndDatatypeError.csv-metadata.json",
         BothValueAndDatatypeError),
        ("tests/value_urls.NoValueOrLiteralError.csv-metadata.json",
         NoValueOrLiteralError),
        ("tests/value_urls.InvalidItemError.csv-metadata.json",
         InvalidItemError),
    ]
    for metadata_file, expected_error in failing_cases:
        converter = CSVW(csv_path="tests/value_urls.csv",
                         metadata_path=metadata_file)
        with pytest.raises(expected_error):
            print(converter.to_rdf())
def test_tmp_files():
    """Serialization temp files are created read-only and removed on close()."""
    scratch = tempfile.mkdtemp(dir="/tmp")
    assert len(os.listdir(scratch)) == 0

    converter = CSVW(csv_path="./tests/books.csv",
                     metadata_path="./tests/books.csv-metadata.json",
                     temp_dir=scratch)
    # Construction alone must not create any temp file.
    assert len(os.listdir(scratch)) == 0

    converter.to_rdf(fmt="nt")
    produced = os.listdir(scratch)
    assert len(produced) == 1, "nt serialization should generate only 1 temp file"
    assert produced[0].endswith(".nt")

    os.remove(os.path.join(scratch, produced[0]))
    assert len(os.listdir(scratch)) == 0

    # Turtle serialization goes through an intermediate nt file.
    converter.to_rdf(fmt="turtle")
    produced = os.listdir(scratch)
    assert len(produced) == 2, "ttl serialization should generate two temps file"
    assert any([name.endswith(".nt") for name in produced])
    assert any([name.endswith(".ttl") for name in produced])

    # Check permissions: readable by all, writable by none.
    readable_flags = [stat.S_IRUSR, stat.S_IRGRP, stat.S_IROTH]
    writable_flags = [stat.S_IWUSR, stat.S_IWGRP, stat.S_IWOTH]
    for name in produced:
        mode = os.stat(os.path.join(scratch, name)).st_mode
        for r_flag, w_flag in zip(readable_flags, writable_flags):
            assert bool(mode & r_flag)
            assert not bool(mode & w_flag)

    converter.close()
    assert len(os.listdir(scratch)) == 0
def test(self):
    """Run one CSVW conformance case against its expected RDF result.

    NOTE(review): relies on names bound outside this method (``option``,
    ``csv_file``, ``TYPES``, ``type``, ``result_url``, ``name``) —
    presumably injected per-case by a test-suite generator; confirm
    against the enclosing module.
    """
    # Metadata is optional for a case.
    metadata = None
    if 'metadata' in option:
        metadata = option['metadata']
    try:
        csvw = CSVW(csv_file, metadata_url=metadata)
    except Exception as e:
        # this should be a negative test
        if TYPES[type]:
            traceback.print_exc()
        self.assertFalse(TYPES[type])
        return
    # if we get here this should be a positive test
    self.assertTrue(TYPES[type])
    # if we can parse it we should at least produce some embedded metadata
    self.assertNotEqual(csvw.metadata, None)
    # and the result should exists
    self.assertNotEqual(result_url, None)
    # Compare the conversion against the published expected graph; both
    # serializations are dumped to output_rdf/ for manual inspection.
    gr = Graph()
    result = gr.parse(result_url)
    converted_result = csvw.to_rdf()
    result.serialize('output_rdf/' + name + '.ttl', format='turtle')
    converted_result.serialize('output_rdf/generated' + name + '.ttl',
                               format='turtle')
    self.assertTrue(compare.isomorphic(result, converted_result))
def test_default_with_datatype():
    """Virtual columns with default values honor their declared datatypes."""
    converter = CSVW(
        csv_path='tests/virtual1.csv',
        metadata_path='tests/virtual1.default.datatype.csv-metadata.json')
    graph = ConjunctiveGraph()
    graph.parse(data=converter.to_rdf(), format="turtle")

    ns = Namespace("http://example.org/")
    for idx in [1, 2]:
        subject = ns['sub-{}'.format(idx)]

        # Boolean default: one triple, typed xsd:boolean, truthy value.
        active_matches = list(graph.triples((subject, ns['active'], None)))
        assert len(active_matches) == 1
        active_literal = active_matches[0][2]
        assert isinstance(active_literal, Literal)
        assert active_literal.datatype == XSD.boolean
        assert active_literal.value

        # Plain string default.
        str_matches = list(graph.triples((subject, ns['stringprop1'], None)))
        assert len(str_matches) == 1
        str_literal = str_matches[0][2]
        assert isinstance(str_literal, Literal)
        assert str_literal.value == "some string"

        # String default must not be percent-encoded.
        str_matches = list(graph.triples((subject, ns['stringprop2'], None)))
        assert len(str_matches) == 1
        str_literal = str_matches[0][2]
        assert isinstance(str_literal, Literal)
        assert "%20" not in str_literal.value
def test_null_values_with_single_string():
    """A single null marker suppresses matching subjects and objects."""
    converter = CSVW(csv_path="tests/null1.csv",
                     metadata_path="tests/null1.single.csv-metadata.json")
    graph = ConjunctiveGraph()
    graph.parse(data=converter.to_rdf(), format="turtle")

    # There should be no subject NA
    subjects = set(graph.subjects())
    assert subj_ns['null_key'] not in subjects
    assert subj_ns['1'] in subjects
    assert len(subjects) == 4

    # Null valued objects should not be created
    objects = set(graph.objects())
    assert Literal('null_key', datatype=XSD.token) not in objects
    assert Literal('null_sector') not in objects
    assert Literal('null_id', datatype=XSD.token) not in objects
    assert Literal('PUBLIC') in objects
    assert Literal('12', datatype=XSD.token) in objects

    # Spot check: within a single row, null cells vanish while others survive.
    null_id_lit = Literal('null_id', datatype=XSD.token)
    assert len(list(graph.triples((subj_ns['2'], id_uri, null_id_lit)))) == 0
    assert len(list(graph.triples((subj_ns['2'], sect_uri, Literal('PRIVATE'))))) == 1
    assert len(list(graph.triples((subj_ns['3'], sect_uri, Literal('null_sector'))))) == 0
    twelve = Literal('12', datatype=XSD.token)
    assert len(list(graph.triples((subj_ns['3'], id_uri, twelve)))) == 1
def test_literals_with_new_lines():
    """Quoted CSV cells containing newlines survive as multi-line literals."""
    converter = CSVW(
        csv_path="tests/parsing.quoted_newlines.csv",
        metadata_path="tests/parsing.quoted_newlines.csv-metadata.json")
    graph = ConjunctiveGraph()
    graph.parse(data=converter.to_rdf(), format="turtle")

    ns = Namespace("http://example.org/expense/")
    desc = URIRef("http://example.org/desc")

    def sole_description(subject):
        # Every expense subject carries exactly one description literal.
        matches = list(graph.triples((subject, desc, None)))
        assert len(matches) == 1
        literal = matches[0][2]
        assert isinstance(literal, Literal)
        return literal

    assert len(sole_description(ns['taxi']).value.splitlines()) == 2

    flight = URIRef("http://example.org/expense/multi-hop%20flight")
    assert len(sole_description(flight).value.splitlines()) == 4

    dinner_desc = sole_description(ns['dinner'])
    assert u'\u2019' in dinner_desc, "Expected to read unicode characters"
    assert u"('')" in dinner_desc, "Expected to read apostrophes"
def test_single_table_using_path():
    """CSVW accepts plain file paths for both the CSV and its metadata."""
    converter = CSVW(csv_path="tests/simple.csv",
                     metadata_path="tests/simple.csv-metadata.json")
    verify_rdf(converter.to_rdf())
def test_empty():
    """An empty CSV yields an empty RDF graph."""
    converter = CSVW(csv_path="tests/empty.csv",
                     metadata_path="tests/empty.csv-metadata.json")
    graph = ConjunctiveGraph()
    graph.parse(data=converter.to_rdf(), format="turtle")
    assert len(graph) == 0
def test_empty_boolean():
    """Empty boolean cells default to false; an invalid base yields no triples."""
    converter = CSVW(csv_path="tests/empty.csv",
                     metadata_path="tests/empty.bool.csv-metadata.json")
    graph = ConjunctiveGraph()
    graph.parse(data=converter.to_rdf(), format="turtle")
    assert len(graph) == 2
    assert len(list(graph.triples((None, None, Literal(False))))) == 2

    converter = CSVW(csv_path="tests/empty.csv",
                     metadata_path="tests/empty.invalid_base.csv-metadata.json")
    graph = ConjunctiveGraph()
    graph.parse(data=converter.to_rdf(), format="turtle")
    assert len(graph) == 0
def test_multiple_tables_through_paths():
    """Several CSV paths can be combined under one metadata handle."""
    metadata_path = "tests/multiple_tables.csv-metadata.json"
    table_paths = ("tests/multiple_tables.Name-ID.csv",
                   "tests/multiple_tables.ID-Age.csv")
    with open(metadata_path, 'r') as metadata_f:
        metadata = io.StringIO(text(metadata_f.read()))
    converter = CSVW(csv_path=table_paths, metadata_handle=metadata)
    verify_rdf(converter.to_rdf())
def test_single_table_using_handles():
    """CSVW accepts in-memory handles instead of file paths."""
    with io.open("tests/simple.csv") as csv_f, \
            io.open("tests/simple.csv-metadata.json", 'r') as metadata_f:
        # Copy file contents into StringIO handles to decouple from the files.
        csv_handle = io.StringIO(csv_f.read())
        metadata = io.StringIO(metadata_f.read())
        converter = CSVW(csv_handle=csv_handle, metadata_handle=metadata)
        verify_rdf(converter.to_rdf())
def test_multiple_tables_through_handles():
    """Multiple CSV handles can be merged under a single metadata handle."""
    with io.open("tests/multiple_tables.csv-metadata.json", 'r') as metadata_f, \
            io.open("tests/multiple_tables.Name-ID.csv") as csv1_f, \
            io.open("tests/multiple_tables.ID-Age.csv") as csv2_f:
        metadata = io.StringIO(metadata_f.read())
        handles = [io.StringIO(csv1_f.read()), io.StringIO(csv2_f.read())]
        converter = CSVW(csv_handle=handles, metadata_handle=metadata)
        verify_rdf(converter.to_rdf())
def test_default():
    """Virtual columns with default values generate triples for every row.

    Bug fix: the triple-count assertions previously wrapped the generator
    returned by ``g.triples`` in a *list literal* (``[g.triples(...)]``),
    which always has length 1 regardless of the graph's content, so the
    checks could never fail.  The generators are now materialized with
    ``list(...)`` so the counts reflect actual matches.
    """
    csvw = CSVW(csv_path='tests/virtual1.csv',
                metadata_path='tests/virtual1.default.csv-metadata.json')
    rdf_output = csvw.to_rdf()
    g = ConjunctiveGraph()
    g.parse(data=rdf_output, format="turtle")

    all_subjects = {x for x in g.subjects()}
    assert len(all_subjects) == 4

    ns = Namespace("http://example.org/")
    assert ns['sub-1'] in all_subjects
    assert ns['sub-2'] in all_subjects
    # Materialize the generator before counting (was a 1-element list literal).
    assert len(list(g.triples((ns['sub-1'], ns['obj-1'], ns['myvalue'])))) == 1
    assert len(list(g.triples((ns['sub-2'], ns['obj-2'], ns['myvalue'])))) == 1
def test_single_table_using_url(mock_urlopen):
    """CSV content can be fetched from a URL (urlopen is mocked)."""
    csv_url = "http://example.org/simple.csv"
    with io.open("tests/simple.csv") as csv_f:
        csv_body = text(csv_f.read())
    # The mocked URL reader serves the local file's contents.
    reader = Mock()
    reader.read.side_effect = [csv_body]
    mock_urlopen.return_value = reader

    converter = CSVW(csv_url=csv_url,
                     metadata_path="tests/simple.csv-metadata.json")
    verify_rdf(converter.to_rdf())
def test_encoding_rdf():
    """A non-UTF-8 ``csv_encoding`` is honored when generating RDF."""
    # With encoding specified
    converter = CSVW(csv_path="./tests/iso_encoding.csv",
                     metadata_path="./tests/iso_encoding.csv-metadata.json",
                     csv_encoding="ISO-8859-1")
    graph = ConjunctiveGraph()
    graph.parse(data=converter.to_rdf(), format="turtle")

    units = Namespace('http://example.org/units/')
    cars = Namespace('http://example.org/cars/')
    meta = Namespace("http://example.org/properties/")

    # The micro sign from the ISO-8859-1 file should survive, percent-encoded.
    expected_unit = units[quote(u"\xb5100".encode('utf-8'))]
    assert (cars['1'], meta['UnitOfMeasurement'], expected_unit) in graph
    assert expected_unit in list(graph.objects())
def test_bool_with_format():
    """Booleans parsed via custom formats land as xsd:boolean literals."""
    converter = CSVW(csv_path="tests/datatypes.bool.csv",
                     metadata_path="tests/datatypes.bool.csv-metadata.json")
    graph = ConjunctiveGraph()
    graph.parse(data=converter.to_rdf(), format="turtle")

    true_lit = Literal(True, datatype=XSD.boolean)
    false_lit = Literal(False, datatype=XSD.boolean)
    # Row 1 is all-true; rows 2 and 3 are all-false, across all three formats.
    expectations = [('1', true_lit), ('2', false_lit), ('3', false_lit)]
    for event, literal in expectations:
        subject = NS['event/' + event]
        for prop in ('bool1', 'bool2', 'bool3'):
            assert len(list(graph.triples((subject, NS[prop], literal)))) == 1
def test_null_values_with_multiple_strings():
    """Multiple null markers remove all matching objects and predicates."""
    converter = CSVW(csv_path="tests/null1.csv",
                     metadata_path="tests/null1.multiple.csv-metadata.json")
    graph = ConjunctiveGraph()
    graph.parse(data=converter.to_rdf(), format="turtle")

    objects = set(graph.objects())
    assert Literal('null_key', datatype=XSD.token) not in objects
    assert Literal('null_sector') not in objects
    assert Literal('null_id', datatype=XSD.token) not in objects
    for row_id in ('10', '11', '12', '13'):
        assert Literal(row_id, datatype=XSD.token) not in objects

    # With every id cell null, the id predicate disappears entirely.
    predicates = set(graph.predicates())
    assert id_uri not in predicates
    assert Literal('1', datatype=XSD.token) not in objects
def test_multiple_tables_through_urls(mock_urlopen):
    """Multiple CSV URLs are fetched (mocked) and merged under one metadata."""
    with io.open("tests/multiple_tables.csv-metadata.json", 'r') as metadata_f, \
            io.open("tests/multiple_tables.Name-ID.csv") as csv1_f, \
            io.open("tests/multiple_tables.ID-Age.csv") as csv2_f:
        metadata = io.StringIO(text(metadata_f.read()))
        bodies = [text(csv1_f.read()), text(csv2_f.read())]
        # The mocked reader serves each table body on successive read() calls.
        reader = Mock()
        reader.read.side_effect = bodies
        mock_urlopen.return_value = reader

        converter = CSVW(csv_url=("multiple_tables.Name-ID.csv",
                                  "multiple_tables.ID-Age.csv"),
                         metadata_handle=metadata)
        verify_rdf(converter.to_rdf())
def test_literals_with_escaped_quotes():
    """Escaped quotes and backslashes inside cells survive into literals."""
    converter = CSVW(
        csv_path="tests/parsing.escaped_quotes.csv",
        metadata_path="tests/parsing.escaped_quotes.csv-metadata.json")
    graph = ConjunctiveGraph()
    graph.parse(data=converter.to_rdf(), format="turtle")

    ns = Namespace("http://example.org/expense/")
    desc = URIRef("http://example.org/desc")

    def sole_description(subject):
        # Every expense subject carries exactly one description literal.
        matches = list(graph.triples((subject, desc, None)))
        assert len(matches) == 1
        literal = matches[0][2]
        assert isinstance(literal, Literal)
        return literal

    assert sole_description(ns['taxi']).value == "go from x to y"

    quoted = URIRef("http://example.org/expense/quoted%20expense")
    assert sole_description(quoted).value == "for some reason it came with quotes in it"

    assert sole_description(ns['flight']).value == "had to fly \"escaped quotes business\" for this trip"

    assert sole_description(ns['car']).value == " some \ in it to be escaped"
def test_rdf_events_listing():
    """End-to-end check of the events-listing example: triple count, subjects,
    predicates, event/offer/place shapes, and literal datatypes.

    Bug fixes: the predicate check used to assert ``p in expected_preds``
    while looping over ``expected_preds`` — a tautology that never exercised
    the graph — and its message string was never formatted.  It now checks
    membership in the graph's actual predicate set.  A copy-pasted
    "locations" message on the offers count was also corrected.
    """
    # Generate rdf
    csvw = CSVW(
        csv_path="tests/examples/events-listing.csv",
        metadata_path="tests/examples/events-listing.csv-metadata.json")
    # RIOT throws relative IRI warnings for this example.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", RiotWarning)
        rdf_output = csvw.to_rdf()

    g = rdflib.Graph().parse(data=rdf_output, format="turtle")
    assert len(g) == 20, "Expected 20 triples"

    # Subjects
    subjects = set([x for x in g.subjects()])
    assert len(subjects) == 6, "There should be 6 subjects"
    expected_subjects = ["#event-1", "#event-2", "#place-1", "#place-2",
                         "#offer-1", "#offer-2"]
    for s in expected_subjects:
        assert any([s in str(x) for x in subjects]), \
            "{} expected to be among subjects".format(s)

    # Predicates
    predicates = set([x for x in g.predicates()])
    assert len(predicates) == 7, "There should be 7 predicates"
    schema = rdflib.Namespace("http://schema.org/")
    expected_preds = [rdflib.RDF.type, schema.name, schema.startDate,
                      schema.location, schema.offers, schema.address,
                      schema.url]
    for p in expected_preds:
        # Fixed: check the graph's predicates, not the expectation list itself.
        assert p in predicates, "{} expected to be among predicates".format(p)

    # Events
    events = [x[0] for x in g.triples((None, rdflib.RDF.type, schema.MusicEvent))]
    assert len(events) == 2, "Expected 2 events"
    assert any(["#event-1" in str(e) for e in events])
    assert any(["#event-2" in str(e) for e in events])
    event_names = []
    for e in events:
        for x in g.triples((e, schema.name, None)):
            event_names.append(x[2])
    event_names = set(event_names)
    assert len(event_names) == 1, "There should be one unique event name"
    assert str(next(iter(event_names))) == "B.B. King", \
        "The unique event name should be 'B.B. King'"

    # Start dates
    start_date_data = [(x[0], x[2])
                       for x in g.triples((None, schema.startDate, None))]
    assert len(start_date_data) == 2, "There should be 2 start dates"
    for s in start_date_data:
        assert s[1].datatype == rdflib.XSD.dateTime, \
            "start dates should be xsd:dateTime"
    events = [s[0] for s in start_date_data]
    assert any(["#event-1" in str(e) for e in events])
    assert any(["#event-2" in str(e) for e in events])

    # Locations
    location_data = [(x[0], x[2])
                     for x in g.triples((None, schema.location, None))]
    assert len(location_data) == 2, "There should be 2 locations"
    locations = [x[1] for x in location_data]
    assert any(["#place-1" in str(l) for l in locations])
    assert any(["#place-2" in str(l) for l in locations])
    events = [s[0] for s in location_data]
    assert any(["#event-1" in str(e) for e in events])
    assert any(["#event-2" in str(e) for e in events])

    # Offers
    offer_data = [(x[0], x[2]) for x in g.triples((None, schema.offers, None))]
    assert len(offer_data) == 2, "There should be 2 offers"
    offers = [x[1] for x in offer_data]
    assert any(["#offer-1" in str(l) for l in offers])
    assert any(["#offer-2" in str(l) for l in offers])
    events = [s[0] for s in offer_data]
    assert any(["#event-1" in str(e) for e in events])
    assert any(["#event-2" in str(e) for e in events])
    offers = [x for x in g.triples((None, rdflib.RDF.type, schema.Offer))]
    assert len(offers) == 2, "There should be 2 offers"
    assert any(["#offer-1" in str(e) for e in offers])
    assert any(["#offer-2" in str(e) for e in offers])

    # Urls
    urls = [x for x in g.triples((None, schema.url, None))]
    assert len(urls) == 2, "There should be 2 urls"
    for u in urls:
        assert u[2].datatype == rdflib.XSD.anyURI, \
            "urls should be of type xsd:anyURI"

    # Places: each has exactly one name and one address
    places = [x for x in g.triples((None, rdflib.RDF.type, schema.Place))]
    assert len(places) == 2, "There should be 2 places"
    for p in places:
        assert len(list(g.triples((p[0], schema.name, None)))) == 1, \
            "Place should have one name"
        assert len(list(g.triples((p[0], schema.address, None)))) == 1, \
            "Place should have one address"
def test_rdf_tree_ops_ext():
    """End-to-end check of the tree-ops-ext example: triple counts per row,
    predicates, and the datatype/language of every column's literals.

    Bug fixes: the assertion messages for the ``#comments`` predicate were
    swapped between the positive and negative branches, and two messages
    copy-pasted "dbh" where they described inventory_date / kml data.
    Assertion logic is unchanged.
    """
    # Generate rdf
    csvw = CSVW(csv_path="tests/examples/tree-ops-ext.csv",
                metadata_path="tests/examples/tree-ops-ext.csv-metadata.json")
    rdf_output = csvw.to_rdf()
    g = rdflib.Graph().parse(data=rdf_output, format="turtle")

    # Validate rdf
    # 9 columns, 1 suppressed, two null comments and 7 separated comments
    total_triples = (9-1)*3 - 2 + 7
    assert len(g) == total_triples, \
        "Expecting a total of {} triples".format(total_triples)
    table_url = "http://example.org/tree-ops-ext.csv"

    # Subject
    subjects = set([x for x in g.subjects()])
    assert len(subjects) == 3, "Expecting 3 subjects, one for each row"

    # Subjects should be named as specified by aboutURL and primary key
    expected_preds = ["on_street", "species", "trim_cycle", "dbh",
                      "inventory_date", "protected", "kml"]
    pred_comments = rdflib.URIRef("{}#comments".format(table_url))
    for ind in [1, 2, 6]:
        uri = rdflib.URIRef("http://example.org/tree-ops-ext#gid-{}".format(ind))
        assert uri in subjects, "{} not among subjects".format(uri)
        triples = [t for t in g.triples((uri, None, None))]
        preds = [x[1] for x in triples]
        for p in expected_preds:
            p_uri = rdflib.URIRef("{}#{}".format(table_url, p))
            assert p_uri in preds, \
                "Predicate {} expected to be in triples of {}".format(p_uri, uri)
        if ind == 6:
            # Only row gid-6 carries the (list-valued) comments predicate.
            assert len(triples) == 15, \
                "{} expected to have 15 triples".format(uri)
            assert pred_comments in preds, \
                "Predicate {} should be there for {}".format(pred_comments, uri)
        else:
            assert len(triples) == 7, \
                "{} expected to have 7 triples".format(uri)
            assert pred_comments not in preds, \
                "Predicate {} should not be there for {}".format(pred_comments, uri)

    # Predicates
    predicates = set([x for x in g.predicates()])
    assert len(predicates) == 8, "Expecting 8 different predicates"

    # Objects: on_street is an untyped, language-less literal
    on_street_objs = []
    for t in g.triples((None, rdflib.URIRef("{}#on_street".format(table_url)), None)):
        on_street_objs.append(t[2])
    assert len(on_street_objs) == 3, "There should be 3 on_street triples"
    for o in on_street_objs:
        assert o.datatype is None, \
            "datatype should not be specified for on_street data"
        assert o.language is None, \
            "language should not be specified for on_street data"

    species_objs = []
    for t in g.triples((None, rdflib.URIRef("{}#species".format(table_url)), None)):
        species_objs.append(t[2])
    assert len(species_objs) == 3, "There should be 3 species triples"
    for o in species_objs:
        assert o.datatype is None, \
            "datatype should not be specified for species data"
        assert o.language is None, \
            "language should not be specified for species data"

    # trim_cycle carries an English language tag
    trim_cycle_objs = []
    for t in g.triples((None, rdflib.URIRef("{}#trim_cycle".format(table_url)), None)):
        trim_cycle_objs.append(t[2])
    assert len(trim_cycle_objs) == 3, "There should be 3 trim_cycle triples"
    for o in trim_cycle_objs:
        assert o.datatype is None, \
            "datatype should not be specified for trim_cycle data"
        assert str(o.language) == "en", \
            "language should be specified as en for trim_cycle data"

    # dbh is an xsd:integer (``long`` here — presumably a py2/py3 compat
    # builtin imported at module level; verify against the file header)
    dbh_objs = []
    for t in g.triples((None, rdflib.URIRef("{}#dbh".format(table_url)), None)):
        dbh_objs.append(t[2])
    assert len(dbh_objs) == 3, "There should be 3 dbh triples"
    for o in dbh_objs:
        assert isinstance(o.value, long), "dbh should be an integer"
        assert o.datatype == rdflib.XSD.integer, \
            "datatype should be specified as integer for dbh data"
        assert o.language is None, "language should not be specified for dbh data"

    inv_date_objs = []
    for t in g.triples((None, rdflib.URIRef("{}#inventory_date".format(table_url)), None)):
        inv_date_objs.append(t[2])
    assert len(inv_date_objs) == 3, "There should be 3 inventory_date triples"
    for o in inv_date_objs:
        assert isinstance(o.value, date), "inventory_date should be a date"
        assert o.value.year == 2010, "inventory_date year should be 2010"
        assert o.datatype == rdflib.XSD.date, \
            "datatype should be specified as date for inventory_date data"
        assert o.language is None, \
            "language should not be specified for inventory_date data"

    protected_objs = []
    for t in g.triples((None, rdflib.URIRef("{}#protected".format(table_url)), None)):
        protected_objs.append(t[2])
    assert len(protected_objs) == 3, "There should be 3 protected triples"
    for o in protected_objs:
        assert isinstance(o.value, bool), "protected should be a bool"
        assert o.datatype == rdflib.XSD.boolean, \
            "datatype should be specified as bool for protected data"
        assert o.language is None, \
            "language should not be specified for protected data"

    # kml cells are parsed XML literals
    kml_objs = []
    for t in g.triples((None, rdflib.URIRef("{}#kml".format(table_url)), None)):
        kml_objs.append(t[2])
    assert len(kml_objs) == 3, "There should be 3 kml triples"
    for o in kml_objs:
        assert isinstance(o.value, xml.dom.minidom.Document), \
            "kml should be an XML document"
        assert o.datatype == rdflib.RDF.XMLLiteral, \
            "datatype should be specified as xml for kml data"
        assert o.language is None, "language should not be specified for kml data"

    comments_objs = []
    for t in g.triples((None, pred_comments, None)):
        comments_objs.append(t[2])
    assert len(comments_objs) == 8, "There should be 8 comments"
    for o in comments_objs:
        assert o.datatype is None, \
            "datatype should not be specified for comments data"
        assert o.language is None, \
            "language should not be specified for comments data"
def test_group_of_tables(mock_urlopen):
    """Convert the W3C "group of tables" example (4 CSV URLs + one metadata
    URL, all served by the mocked urlopen) and verify every generated triple.
    """
    # Serve the fixture files whenever their URL is requested.
    mock_urlopen.side_effect = dispatch_files_as_url
    csv_urls = [
        "http://example.org/gov.uk/data/organizations.csv",
        "http://example.org/gov.uk/data/professions.csv",
        "http://example.org/senior-roles.csv",
        "http://example.org/junior-roles.csv"
    ]
    csvw = CSVW(csv_url=csv_urls,
                metadata_url="http://example.org/csv-metadata.json")
    # RIOT emits relative-IRI warnings for this example; silence them.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", RiotWarning)
        rdf_output = csvw.to_rdf()
    g = rdflib.Graph().parse(data=rdf_output, format="turtle")

    # Namespaces and predicate URIs used by the example's metadata.
    org = Namespace("http://www.w3.org/ns/org#")
    post_in = URIRef("http://example.org/organization/hefce.ac.uk")
    grade = URIRef("http://example.org/gov.uk/def/grade")
    job = URIRef("http://example.org/gov.uk/def/job")
    prof = URIRef("http://example.org/gov.uk/def/profession")
    post = Namespace("http://example.org/organization/hefce.ac.uk/post/")
    person = Namespace("http://example.org/organization/hefce.ac.uk/person/")
    min_pay = URIRef("http://example.org/gov.uk/def/min_pay")
    max_pay = URIRef("http://example.org/gov.uk/def/max_pay")
    num_posts = URIRef("http://example.org/gov.uk/def/number_of_posts")
    post_90115 = post["90115"]
    post_90334 = post["90334"]
    p1 = person["1"]
    p2 = person["2"]

    # Senior post 90115: all 7 expected triples.
    post_90115_triples = list(g.triples((post_90115, None, None)))
    assert len(post_90115_triples) == 7
    assert (post_90115, DCTERMS.identifier, Literal("90115")) in post_90115_triples
    assert (post_90115, org.heldBy, p1) in post_90115_triples
    assert (post_90115, grade, Literal("SCS1A")) in post_90115_triples
    assert (post_90115, job, Literal("Deputy Chief Executive")) in post_90115_triples
    assert (post_90115, org.reportsTo, post_90334) in post_90115_triples
    assert (post_90115, prof, Literal("Finance")) in post_90115_triples
    assert (post_90115, org.postIn, post_in) in post_90115_triples

    # The holder of post 90115.
    p1_triples = list(g.triples((p1, None, None)))
    assert len(p1_triples) == 1
    assert (p1, FOAF.name, Literal("Steve Egan")) in p1_triples

    # Senior post 90334 (no reportsTo, hence 6 triples).
    post_90334_triples = list(g.triples((post_90334, None, None)))
    assert len(post_90334_triples) == 6
    assert (post_90334, DCTERMS.identifier, Literal("90334")) in post_90334_triples
    assert (post_90334, org.heldBy, p2) in post_90334_triples
    assert (post_90334, grade, Literal("SCS4")) in post_90334_triples
    assert (post_90334, job, Literal("Chief Executive")) in post_90334_triples
    assert (post_90334, prof, Literal("Policy")) in post_90334_triples
    assert (post_90334, org.postIn, post_in) in post_90334_triples

    # The holder of post 90334.
    p2_triples = list(g.triples((p2, None, None)))
    assert len(p2_triples) == 1
    assert (p2, FOAF.name, Literal("Sir Alan Langlands")) in p2_triples

    # Junior roles come out as blank nodes; locate them via their grade.
    bnode1 = list(g.triples((None, grade, Literal("4"))))[0][0]
    b1_triples = list(g.triples((bnode1, None, None)))
    assert len(b1_triples) == 8
    assert (bnode1, org.reportsTo, post_90115) in b1_triples
    assert (bnode1, min_pay, Literal(17426, datatype=XSD.integer)) in b1_triples
    assert (bnode1, max_pay, Literal(20002, datatype=XSD.integer)) in b1_triples
    assert (bnode1, job, Literal("Administrator")) in b1_triples
    assert (bnode1, num_posts, Literal(8.67, datatype=XSD.double)) in b1_triples
    assert (bnode1, prof, Literal("Operational Delivery")) in b1_triples
    assert (bnode1, org.postIn, post_in) in b1_triples

    bnode2 = list(g.triples((None, grade, Literal("5"))))[0][0]
    b2_triples = list(g.triples((bnode2, None, None)))
    assert len(b2_triples) == 8
    assert (bnode2, org.reportsTo, post_90115) in b2_triples
    assert (bnode2, min_pay, Literal(19546, datatype=XSD.integer)) in b2_triples
    assert (bnode2, max_pay, Literal(22478, datatype=XSD.integer)) in b2_triples
    assert (bnode2, job, Literal("Administrator")) in b2_triples
    assert (bnode2, num_posts, Literal(0.5, datatype=XSD.double)) in b2_triples
    assert (bnode2, prof, Literal("Operational Delivery")) in b2_triples
    assert (bnode2, org.postIn, post_in) in b2_triples

    # Total must equal the sum of the per-subject counts checked above.
    assert len(list(g.triples((None, None, None)))) == 7 + 1 + 6 + 1 + 8 + 8
def test_multiple_value_urls_in_virtual():
    """Virtual columns may emit several valueUrls, including RDF collections
    with OWL datatype restrictions.

    Bug fix: the constant-value loop used to query ``r_amount`` on every
    iteration instead of the loop variable ``s``, so the constant lists of
    ``r_id`` and ``r_desc`` were never actually checked.  It now queries
    the current subject.
    """
    csvw = CSVW(csv_path="tests/value_urls.csv",
                metadata_path="tests/value_urls.csv-metadata.json")
    rdf_contents = csvw.to_rdf(fmt="nt")
    g = ConjunctiveGraph()
    g.parse(data=rdf_contents, format="nt")

    # Test subjects
    all_subjects = list(g.subjects())
    s_amount = NS['amount']
    s_desc = NS['description']
    s_id = NS['id']
    assert s_amount in all_subjects
    assert s_desc in all_subjects
    assert s_id in all_subjects

    # Test descriptions
    p_def = NS['definition']
    assert len(list(g.triples((s_amount, p_def, Literal("the amount paid"))))) == 1
    assert len(list(g.triples((s_desc, p_def, Literal("description of the expense"))))) == 1
    assert len(list(g.triples((s_id, p_def, Literal("transaction id"))))) == 1

    # Test each is a element type
    o_element = NS['element']
    assert len(list(g.triples((s_amount, RDF.type, o_element)))) == 1
    assert len(list(g.triples((s_desc, RDF.type, o_element)))) == 1
    assert len(list(g.triples((s_id, RDF.type, o_element)))) == 1

    # Test that range is specified
    r_amount = NS['element/amount-RANGE']
    r_desc = NS['element/description-RANGE']
    r_id = NS['element/id-RANGE']
    assert len(list(g.triples((s_amount, RDFS.range, r_amount)))) == 1
    assert len(list(g.triples((s_desc, RDFS.range, r_desc)))) == 1
    assert len(list(g.triples((s_id, RDFS.range, r_id)))) == 1

    # Range is another subject
    assert r_amount in all_subjects
    assert r_desc in all_subjects
    assert r_id in all_subjects

    # Range is a OWL datatype of specified type
    assert len(list(g.triples((r_amount, OWL.onDatatype, XSD.decimal)))) == 1
    assert len(list(g.triples((r_desc, OWL.onDatatype, XSD.string)))) == 1
    assert len(list(g.triples((r_id, OWL.onDatatype, XSD.integer)))) == 1

    # Check the restrictions for amount: walk the RDF collection node by node.
    rest_amount_node = list(g.triples((r_amount, OWL.withRestrictions, None)))
    rest_amount_node = rest_amount_node[0][2]
    assert isinstance(rest_amount_node, BNode)
    assert len(list(g.triples((rest_amount_node, RDF.first, XSD.decimal)))) == 1
    rest_amount_node = next(
        g.objects(subject=rest_amount_node, predicate=RDF.rest))
    assert len(list(g.triples((rest_amount_node, RDF.first, XSD.MaxLength)))) == 1
    rest_amount_node = next(
        g.objects(subject=rest_amount_node, predicate=RDF.rest))
    assert len(list(g.triples(
        (rest_amount_node, RDF.first,
         Literal(10, datatype=XSD.nonNegativeInteger))))) == 1
    rest_amount_node = next(
        g.objects(subject=rest_amount_node, predicate=RDF.rest))
    assert len(list(g.triples((rest_amount_node, RDF.first, XSD.MinLength)))) == 1
    rest_amount_node = next(
        g.objects(subject=rest_amount_node, predicate=RDF.rest))
    assert len(list(g.triples(
        (rest_amount_node, RDF.first,
         Literal(1, datatype=XSD.nonNegativeInteger))))) == 1
    rest_amount_node = next(
        g.objects(subject=rest_amount_node, predicate=RDF.rest))
    # End of the collection: the nil node carries no first/rest.
    assert len(list(g.triples((rest_amount_node, RDF.first, None)))) == 0
    assert len(list(g.triples((rest_amount_node, RDF.rest, None)))) == 0

    # Check the restrictions for description
    rest_desc_node = list(g.triples((r_desc, OWL.withRestrictions, None)))
    rest_desc_node = rest_desc_node[0][2]
    assert isinstance(rest_desc_node, BNode)
    assert len(list(g.triples((rest_desc_node, RDF.first, XSD.string)))) == 1
    rest_desc_node = next(g.objects(subject=rest_desc_node, predicate=RDF.rest))
    assert len(list(g.triples((rest_desc_node, RDF.first, XSD.MaxLength)))) == 1
    rest_desc_node = next(g.objects(subject=rest_desc_node, predicate=RDF.rest))
    assert len(list(g.triples(
        (rest_desc_node, RDF.first,
         Literal(100, datatype=XSD.nonNegativeInteger))))) == 1
    rest_desc_node = next(g.objects(subject=rest_desc_node, predicate=RDF.rest))
    assert len(list(g.triples((rest_desc_node, RDF.first, None)))) == 0
    assert len(list(g.triples((rest_desc_node, RDF.rest, None)))) == 0

    # Check the restrictions for id
    rest_id_node = list(g.triples((r_id, OWL.withRestrictions, None)))
    rest_id_node = rest_id_node[0][2]
    assert isinstance(rest_id_node, BNode)
    assert len(list(g.triples((rest_id_node, RDF.first, XSD.integer)))) == 1
    rest_id_node = next(g.objects(subject=rest_id_node, predicate=RDF.rest))
    assert len(list(g.triples((rest_id_node, RDF.first, XSD.MinLength)))) == 1
    rest_id_node = next(g.objects(subject=rest_id_node, predicate=RDF.rest))
    assert len(list(g.triples(
        (rest_id_node, RDF.first,
         Literal(0, datatype=XSD.nonNegativeInteger))))) == 1
    rest_id_node = next(g.objects(subject=rest_id_node, predicate=RDF.rest))
    assert len(list(g.triples((rest_id_node, RDF.first, None)))) == 0
    assert len(list(g.triples((rest_id_node, RDF.rest, None)))) == 0

    # Check constant value for each
    const_prop = NS['another-list-value-with-constants']
    for s in [r_amount, r_id, r_desc]:
        # Fixed: query the current subject ``s``, not always ``r_amount``.
        constant_node = list(g.triples((s, const_prop, None)))
        constant_node = constant_node[0][2]
        assert isinstance(constant_node, BNode)
        assert len(list(g.triples((constant_node, RDF.first, XSD.Length)))) == 1
        constant_node = next(
            g.objects(subject=constant_node, predicate=RDF.rest))
        assert len(list(g.triples(
            (constant_node, RDF.first,
             Literal(1, datatype=XSD.nonNegativeInteger))))) == 1
        constant_node = next(
            g.objects(subject=constant_node, predicate=RDF.rest))
        assert len(list(g.triples((constant_node, RDF.first, None)))) == 0
        assert len(list(g.triples((constant_node, RDF.rest, None)))) == 0

    # Verify that empty valueUrl does not end up in graph or rdf contents
    assert NS['empty-list-predicate1'] not in list(g.objects())
    assert "empty-list-predicate1" not in rdf_contents

    # Verify that empty valueUrl does not end up in graph
    assert NS['empty-list-predicate2'] not in list(g.objects())
    assert "empty-list-predicate2" not in rdf_contents

    # Test total number of lists through rdf:nils in order to verify each list
    # ends up with a nil
    test_num_lists = 3 * 3  # 3 rows and 3 virtual list valued columns
    nil_text = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#nil> ."
    assert rdf_contents.count(nil_text) == test_num_lists
def test_negative_invalid_column():
    """Metadata referencing a non-existent column must raise on serialization."""
    model = CSVW(csv_path='tests/virtual1.csv',
                 metadata_path='tests/virtual1.negative2.csv-metadata.json')
    # Construction succeeds; the bad substitution only surfaces when the
    # RDF output is actually generated.
    with pytest.raises(FailedSubstitutionError):
        print(model.to_rdf())
TRIPLES_SIZES = [ 20000, 30000, 50000, 100000, 200000, 300000, 500000, 1000000, 2000000 ] FORMATS = ["turtle", "xml", "json-ld"] for fmt in FORMATS: print "|Number of triples|pycsvw {fmt} (sec)|rdflib {fmt} (sec)|".format( fmt=fmt) for num_triples in TRIPLES_SIZES: generate_csv_and_metadata(num_triples) start = time.time() # Generate nt first for fairness to pycsvw csvw = CSVW( csv_path="csvfile.{}.csv".format(num_triples), metadata_path="csvfile.{}.csv-metadata.json".format(num_triples)) pycsvw_output = csvw.to_rdf(fmt) with open( "{fmt}file.{num_t}.pycsvw.{fmt}".format(fmt=fmt, num_t=num_triples), "w") as out_file: out_file.write(pycsvw_output.encode("utf-8")) pycsvw_time = time.time() - start # Write the same contents into an nt-file using rdflib num_rows = int(num_triples) / NUM_COLS start = time.time() g = ConjunctiveGraph() for row in xrange(num_rows): for col in xrange(NUM_COLS): g.add( (URIRef(
from speed_test_csv import NUM_COLS, generate_csv_and_metadata TRIPLES_SIZES = [ 20000, 30000, 50000, 100000, 200000, 300000, 500000, 1000000, 2000000 ] print "|Number of triples|pycsvw (sec)|rdflib (sec)|" for num_triples in TRIPLES_SIZES: # Generate csv and its metadata generate_csv_and_metadata(num_triples) # Generate NT using pycsvw start = time.time() csvw = CSVW( csv_path="csvfile.{}.csv".format(num_triples), metadata_path="csvfile.{}.csv-metadata.json".format(num_triples)) nt_output = csvw.to_rdf("nt") with open("ntfile.{}.pycsvw.nt".format(num_triples), "w") as nt_file: nt_file.write(nt_output.encode("utf-8")) pycsvw_nt_time = time.time() - start # Generate equivalent contents using rdflib num_rows = int(num_triples) / NUM_COLS start = time.time() g = ConjunctiveGraph() for row in xrange(num_rows): for col in xrange(NUM_COLS): g.add( (URIRef("http://www.example.org/subjectrow{}col0".format(row)), URIRef("http://www.example.org/predcolumn{}".format(col)), Literal("row{}col{}".format(row, col))))
def test_url_safe_chars():
    """Check that URL-special characters survive CSV-to-RDF conversion.

    Each row of the fixture exercises one special character (#, /, :, -,
    space, _) in both subject URIs and literal cell values; the virtual
    columns additionally check percent-encoding of the space character.
    """
    csvw = CSVW(csv_path="tests/url_special_chars.csv",
                metadata_path="tests/url_special_chars.csv-metadata.json")
    graph = ConjunctiveGraph()
    graph.parse(data=csvw.to_rdf(), format="turtle")

    # (subject URI, literal cell values, raw virtual value, encoded virtual value)
    cases = [
        ('http://www.example.org/c#1/chash2/chash3/chash4/chash5/chash6',
         ['c#1', 'chash2', 'chash3', 'chash4', 'chash6', 'chash5'],
         '#/:- _r1', '#/:-%20_r1'),
        ('http://www.example.org/c/1/c/2/c/3/c/4/c/5/c/6',
         ['c/1', 'c/2', 'c/3', 'c/4', 'c/6', 'c/5'],
         '/#:- _r2', '/#:-%20_r2'),
        ('http://www.example.org/c:1/c:2/c:3/c:4/c:5/c:6',
         ['c:1', 'c:2', 'c:3', 'c:4', 'c:6', 'c:5'],
         ':#/-_ r3', ':#/-_%20r3'),
        ('http://www.example.org/c-1/c-2/c-3/c-4/c-5/c-6',
         ['c-1', 'c-2', 'c-3', 'c-4', 'c-6', 'c-5'],
         '-/#_ :r4', '-/#_%20:r4'),
        ('http://www.example.org/c%201/c%202/c%203/c%204/c%205/c%206',
         ['c 1', 'c 2', 'c 3', 'c 4', 'c 6', 'c 5'],
         ' -/#:_r5', '%20-/#:_r5'),
        ('http://www.example.org/c_1/c_2/c_3/c_4/c_5/c_6',
         ['c_1', 'c_2', 'c_3', 'c_4', 'c_6', 'c_5'],
         '_ /:#r6', '_%20/:#r6'),
    ]

    for subject_uri, cell_values, raw_value, encoded_value in cases:
        subject = URIRef(subject_uri)
        verify_non_virtual_columns(subject, graph,
                                   [Literal(v) for v in cell_values])
        verify_virtual_columns(subject, graph, raw_value, encoded_value)
def test_individual_formats(fmt, validate_func, rdflib_input):
    """Serialize the books fixture in one format and validate the output.

    :param fmt: serialization format name forwarded to ``to_rdf``.
    :param validate_func: callable that checks the raw serialized text.
    :param rdflib_input: format name rdflib should use to re-parse the output.
    """
    model = CSVW(csv_path="./tests/books.csv",
                 metadata_path="./tests/books.csv-metadata.json")
    serialized = model.to_rdf(fmt=fmt)
    # First a format-specific sanity check, then a semantic round-trip check.
    validate_func(serialized)
    verify_rdf_contents(serialized, rdflib_input)