Exemplo n.º 1
0
def test_metadata_mismatch():
    """Check the errors raised when metadata disagrees with the CSV columns.

    Two metadata files declare a different number of non-virtual columns
    than the CSV provides; converting with either must raise
    ``NumberOfNonVirtualColumnsMismatch`` and report both counts.  A third
    metadata file must already fail while the ``CSVW`` object is built,
    raising ``VirtualColumnPrecedesNonVirtualColumn``.
    """
    data_file = "tests/negative.metadata_mismatch.csv"

    # Both converters are built up front; the mismatch is only expected to
    # surface once to_rdf() is called.
    mismatched = [
        (CSVW(csv_path=data_file,
              metadata_path="tests/negative.NumberOfNonVirtualColumnsMismatch1.csv-metadata.json"),
         "metadata, 2"),
        (CSVW(csv_path=data_file,
              metadata_path="tests/negative.NumberOfNonVirtualColumnsMismatch2.csv-metadata.json"),
         "metadata, 4"),
    ]

    for converter, metadata_fragment in mismatched:
        with pytest.raises(NumberOfNonVirtualColumnsMismatch) as err:
            print(converter.to_rdf())
        message = str(err.value)
        assert metadata_fragment in message
        assert "row 1, 3" in message

    with pytest.raises(VirtualColumnPrecedesNonVirtualColumn) as err:
        CSVW(csv_path=data_file,
             metadata_path='tests/negative.VirtualColumnPrecedesNonVirtualColumn.csv-metadata.json')
    assert "t2" in str(err.value)
Exemplo n.º 2
0
def test_negative():
    """Check that each invalid value-URL metadata file raises its own error.

    Every metadata file below is paired with the exception type that
    converting ``tests/value_urls.csv`` with it is expected to raise.
    """
    from pycsvw.csvw_exceptions import (BothValueAndLiteralError,
                                        BothValueAndDatatypeError,
                                        NoValueOrLiteralError,
                                        InvalidItemError)

    cases = (
        ("tests/value_urls.BothValueAndLiteralError.csv-metadata.json",
         BothValueAndLiteralError),
        ("tests/value_urls.BothValueAndDatatypeError.csv-metadata.json",
         BothValueAndDatatypeError),
        ("tests/value_urls.NoValueOrLiteralError.csv-metadata.json",
         NoValueOrLiteralError),
        ("tests/value_urls.InvalidItemError.csv-metadata.json",
         InvalidItemError),
    )

    for metadata_file, expected_error in cases:
        converter = CSVW(csv_path="tests/value_urls.csv",
                         metadata_path=metadata_file)
        # Construction succeeds; the error is expected during conversion.
        with pytest.raises(expected_error):
            print(converter.to_rdf())
Exemplo n.º 3
0
def test_tmp_files():
    """Check which temporary files a conversion creates and removes.

    The test asserts that building the ``CSVW`` object leaves ``temp_dir``
    empty, that an ``nt`` conversion leaves exactly one ``.nt`` file, that
    a ``turtle`` conversion leaves one ``.nt`` and one ``.ttl`` file, that
    those files are readable but not writable by user, group and others,
    and that ``close()`` empties the directory again.

    The scratch directory itself is always removed afterwards, including
    when an assertion fails part-way through.
    """
    import shutil  # local import: only needed for the cleanup below

    tmp_dir = tempfile.mkdtemp(dir="/tmp")
    try:
        assert len(os.listdir(tmp_dir)) == 0
        csvw = CSVW(csv_path="./tests/books.csv",
                    metadata_path="./tests/books.csv-metadata.json",
                    temp_dir=tmp_dir)
        # Constructing the converter must not create any file yet.
        assert len(os.listdir(tmp_dir)) == 0

        csvw.to_rdf(fmt="nt")
        created_files = os.listdir(tmp_dir)
        assert len(created_files) == 1, \
            "nt serialization should generate only 1 temp file"
        assert created_files[0].endswith(".nt")

        # Start the turtle run from an empty directory again.
        os.remove(os.path.join(tmp_dir, created_files[0]))
        assert len(os.listdir(tmp_dir)) == 0

        csvw.to_rdf(fmt="turtle")
        created_files = os.listdir(tmp_dir)
        assert len(created_files) == 2, \
            "ttl serialization should generate two temp files"
        assert any(f.endswith(".nt") for f in created_files)
        assert any(f.endswith(".ttl") for f in created_files)

        # Check permissions: readable by everyone, writable by no one.
        expected_flags = [stat.S_IRUSR, stat.S_IRGRP, stat.S_IROTH]
        unexpected_flags = [stat.S_IWUSR, stat.S_IWGRP, stat.S_IWOTH]
        for f in created_files:
            mode = os.stat(os.path.join(tmp_dir, f)).st_mode
            for flag in expected_flags:
                assert bool(mode & flag)
            for flag in unexpected_flags:
                assert not bool(mode & flag)

        csvw.close()
        assert len(os.listdir(tmp_dir)) == 0
    finally:
        # mkdtemp leaves deletion to the caller; without this every run
        # (and every failure) would leave a directory behind in /tmp.
        shutil.rmtree(tmp_dir, ignore_errors=True)
Exemplo n.º 4
0
    def test(self):
        """Run one conversion case and compare it with the expected graph.

        ``option``, ``csv_file``, ``TYPES``, ``type``, ``name`` and
        ``result_url`` are not defined in this block; they are expected to
        come from an enclosing scope.  A case whose ``TYPES[type]`` is falsy
        is treated as negative: building the ``CSVW`` object must fail.
        Otherwise the converted graph must be isomorphic to the graph parsed
        from ``result_url``; both graphs are also written to ``output_rdf/``
        as turtle.
        """
        metadata = option['metadata'] if 'metadata' in option else None

        try:
            csvw = CSVW(csv_file, metadata_url=metadata)
        except Exception:
            # A failure here is only acceptable for a negative test; print
            # the traceback when a positive test fails unexpectedly.
            if TYPES[type]:
                traceback.print_exc()
            self.assertFalse(TYPES[type])
            return

        # Reaching this point means the case has to be a positive test.
        self.assertTrue(TYPES[type])

        # A parsable input should at least yield some embedded metadata,
        # and an expected result must have been supplied.
        self.assertNotEqual(csvw.metadata, None)
        self.assertNotEqual(result_url, None)

        expected_graph = Graph().parse(result_url)
        generated_graph = csvw.to_rdf()

        # Keep both serializations on disk for manual inspection.
        expected_graph.serialize('output_rdf/' + name + '.ttl',
                                 format='turtle')
        generated_graph.serialize('output_rdf/generated' + name + '.ttl',
                                  format='turtle')

        self.assertTrue(compare.isomorphic(expected_graph, generated_graph))
Exemplo n.º 5
0
def test_default_with_datatype():
    """Check default values of virtual columns that declare a datatype.

    For subjects ``sub-1`` and ``sub-2`` the graph must hold exactly one
    ``active`` literal (a truthy ``xsd:boolean``), exactly one
    ``stringprop1`` literal equal to ``"some string"`` and exactly one
    ``stringprop2`` literal that contains no ``%20``.
    """
    converter = CSVW(
        csv_path='tests/virtual1.csv',
        metadata_path='tests/virtual1.default.datatype.csv-metadata.json')
    graph = ConjunctiveGraph()
    graph.parse(data=converter.to_rdf(), format="turtle")

    ns = Namespace("http://example.org/")

    def single_literal(subject, predicate):
        """Return the only object for (subject, predicate); must be a Literal."""
        matches = list(graph.triples((subject, predicate, None)))
        assert len(matches) == 1
        obj = matches[0][2]
        assert isinstance(obj, Literal)
        return obj

    for index in (1, 2):
        subject = ns['sub-{}'.format(index)]

        active = single_literal(subject, ns['active'])
        assert active.datatype == XSD.boolean
        assert active.value

        first_string = single_literal(subject, ns['stringprop1'])
        assert first_string.value == "some string"

        second_string = single_literal(subject, ns['stringprop2'])
        assert "%20" not in second_string.value
Exemplo n.º 6
0
def test_null_values_with_single_string():
    """Check that cells equal to the configured null string yield no triples.

    Uses the module-level ``subj_ns``, ``id_uri`` and ``sect_uri`` names.
    """
    converter = CSVW(csv_path="tests/null1.csv",
                     metadata_path="tests/null1.single.csv-metadata.json")
    graph = ConjunctiveGraph()
    graph.parse(data=converter.to_rdf(), format="turtle")

    def token(value):
        """Build an xsd:token literal."""
        return Literal(value, datatype=XSD.token)

    def occurrences(subject, predicate, obj):
        """Count the triples matching exactly (subject, predicate, obj)."""
        return len(list(graph.triples((subject, predicate, obj))))

    # There should be no subject NA
    subjects = set(graph.subjects())
    assert subj_ns['null_key'] not in subjects
    assert subj_ns['1'] in subjects
    assert len(subjects) == 4

    # Null valued objects should not be created
    objects = set(graph.objects())
    assert token('null_key') not in objects
    assert Literal('null_sector') not in objects
    assert token('null_id') not in objects
    assert Literal('PUBLIC') in objects
    assert token('12') in objects

    # Spot check: within one row the null cell is dropped while the
    # non-null cell is kept.
    assert occurrences(subj_ns['2'], id_uri, token('null_id')) == 0
    assert occurrences(subj_ns['2'], sect_uri, Literal('PRIVATE')) == 1
    assert occurrences(subj_ns['3'], sect_uri, Literal('null_sector')) == 0
    assert occurrences(subj_ns['3'], id_uri, token('12')) == 1
Exemplo n.º 7
0
def test_literals_with_new_lines():
    """Check that quoted CSV cells spanning several lines stay multi-line.

    Each expense subject must carry exactly one ``desc`` literal; the line
    counts and the special characters of those literals are then checked.
    """
    converter = CSVW(
        csv_path="tests/parsing.quoted_newlines.csv",
        metadata_path="tests/parsing.quoted_newlines.csv-metadata.json")
    graph = ConjunctiveGraph()
    graph.parse(data=converter.to_rdf(), format="turtle")

    expense = Namespace("http://example.org/expense/")
    desc = URIRef("http://example.org/desc")

    def description_of(subject):
        """Return the single description literal attached to *subject*."""
        matches = list(graph.triples((subject, desc, None)))
        assert len(matches) == 1
        literal = matches[0][2]
        assert isinstance(literal, Literal)
        return literal

    taxi_text = description_of(expense['taxi'])
    assert len(taxi_text.value.splitlines()) == 2

    flight_text = description_of(
        URIRef("http://example.org/expense/multi-hop%20flight"))
    assert len(flight_text.value.splitlines()) == 4

    dinner_text = description_of(expense['dinner'])
    assert u'\u2019' in dinner_text, "Expected to read unicode characters"
    assert u"('')" in dinner_text, "Expected to read apostrophes"
Exemplo n.º 8
0
def test_single_table_using_path():
    """Convert one table given by file paths and verify the RDF output."""
    converter = CSVW(csv_path="tests/simple.csv",
                     metadata_path="tests/simple.csv-metadata.json")
    # verify_rdf is a helper defined elsewhere in this module.
    verify_rdf(converter.to_rdf())
Exemplo n.º 9
0
def test_empty():
    """Check that converting the empty CSV fixture yields no triples."""
    converter = CSVW(csv_path="tests/empty.csv",
                     metadata_path="tests/empty.csv-metadata.json")

    graph = ConjunctiveGraph()
    graph.parse(data=converter.to_rdf(), format="turtle")

    assert len(graph) == 0
Exemplo n.º 10
0
def test_empty_boolean():
    """Check the empty CSV fixture against two further metadata files.

    With the ``bool`` metadata the graph must hold exactly two triples,
    both with a ``False`` literal as object; with the ``invalid_base``
    metadata the graph must be empty.
    """

    def convert(metadata_file):
        """Convert tests/empty.csv with *metadata_file* and parse the result."""
        converter = CSVW(csv_path="tests/empty.csv",
                         metadata_path=metadata_file)
        graph = ConjunctiveGraph()
        graph.parse(data=converter.to_rdf(), format="turtle")
        return graph

    bool_graph = convert("tests/empty.bool.csv-metadata.json")
    assert len(bool_graph) == 2
    assert len(list(bool_graph.triples((None, None, Literal(False))))) == 2

    base_graph = convert("tests/empty.invalid_base.csv-metadata.json")
    assert len(base_graph) == 0
Exemplo n.º 11
0
def test_multiple_tables_through_paths():
    """Convert two tables given by file paths, metadata given as a handle."""
    table_paths = ("tests/multiple_tables.Name-ID.csv",
                   "tests/multiple_tables.ID-Age.csv")

    with open("tests/multiple_tables.csv-metadata.json", 'r') as source:
        # text() is a helper defined elsewhere in this module.
        metadata_handle = io.StringIO(text(source.read()))

    converter = CSVW(csv_path=table_paths, metadata_handle=metadata_handle)
    verify_rdf(converter.to_rdf())
Exemplo n.º 12
0
def test_single_table_using_handles():
    """Convert one table where CSV and metadata are in-memory handles."""
    with io.open("tests/simple.csv") as table_file, \
            io.open("tests/simple.csv-metadata.json", 'r') as meta_file:
        # Copy both files into memory so the converter never sees a path.
        table_handle = io.StringIO(table_file.read())
        meta_handle = io.StringIO(meta_file.read())

    converter = CSVW(csv_handle=table_handle, metadata_handle=meta_handle)
    verify_rdf(converter.to_rdf())
Exemplo n.º 13
0
def test_multiple_tables_through_handles():
    """Convert two tables where CSVs and metadata are in-memory handles."""
    with io.open("tests/multiple_tables.csv-metadata.json", 'r') as meta_file, \
            io.open("tests/multiple_tables.Name-ID.csv") as names_file, \
            io.open("tests/multiple_tables.ID-Age.csv") as ages_file:
        meta_handle = io.StringIO(meta_file.read())
        names_handle = io.StringIO(names_file.read())
        ages_handle = io.StringIO(ages_file.read())

    converter = CSVW(csv_handle=[names_handle, ages_handle],
                     metadata_handle=meta_handle)
    verify_rdf(converter.to_rdf())
Exemplo n.º 14
0
def test_default():
    """Check default values of virtual columns.

    The converted graph must have four distinct subjects, including
    ``sub-1`` and ``sub-2``, and each of those two subjects must carry
    exactly one triple pointing at ``myvalue`` through its ``obj-N``
    predicate.
    """
    csvw = CSVW(csv_path='tests/virtual1.csv',
                metadata_path='tests/virtual1.default.csv-metadata.json')
    rdf_output = csvw.to_rdf()
    g = ConjunctiveGraph()
    g.parse(data=rdf_output, format="turtle")

    all_subjects = set(g.subjects())
    assert len(all_subjects) == 4

    ns = Namespace("http://example.org/")
    assert ns['sub-1'] in all_subjects
    assert ns['sub-2'] in all_subjects
    # list(...) consumes the generator returned by g.triples(); wrapping it
    # in a list literal instead ([g.triples(...)]) always has length 1 and
    # would make these assertions pass unconditionally.
    assert len(list(
        g.triples((ns['sub-1'], ns['obj-1'], ns['myvalue'])))) == 1
    assert len(list(
        g.triples((ns['sub-2'], ns['obj-2'], ns['myvalue'])))) == 1
Exemplo n.º 15
0
def test_single_table_using_url(mock_urlopen):
    """Convert one table whose CSV is fetched through a mocked URL open.

    ``mock_urlopen`` is configured to return a reader whose ``read()``
    yields the content of ``tests/simple.csv`` once.
    """
    with io.open("tests/simple.csv") as table_file:
        table_text = text(table_file.read())

    fake_response = Mock()
    fake_response.read.side_effect = [table_text]
    mock_urlopen.return_value = fake_response

    converter = CSVW(csv_url="http://example.org/simple.csv",
                     metadata_path="tests/simple.csv-metadata.json")
    verify_rdf(converter.to_rdf())
Exemplo n.º 16
0
def test_encoding_rdf():
    """Check conversion of an ISO-8859-1 CSV when the encoding is given.

    The micro sign (U+00B5) in the unit column is expected to show up in
    the generated unit URI as the percent-quoted form of its UTF-8 bytes.
    """
    # With encoding specified
    converter = CSVW(csv_path="./tests/iso_encoding.csv",
                     metadata_path="./tests/iso_encoding.csv-metadata.json",
                     csv_encoding="ISO-8859-1")
    graph = ConjunctiveGraph()
    graph.parse(data=converter.to_rdf(), format="turtle")

    units = Namespace('http://example.org/units/')
    cars = Namespace('http://example.org/cars/')
    meta = Namespace("http://example.org/properties/")

    unit_uri = units[quote(u"\xb5100".encode('utf-8'))]
    assert (cars['1'], meta['UnitOfMeasurement'], unit_uri) in graph
    assert unit_uri in list(graph.objects())
Exemplo n.º 17
0
def test_bool_with_format():
    """Check boolean columns that declare a custom format.

    Event 1 must carry a true literal for ``bool1``..``bool3``; events 2
    and 3 must carry a false literal for each of the three columns.
    ``NS`` is a namespace defined elsewhere in this module.
    """
    converter = CSVW(csv_path="tests/datatypes.bool.csv",
                     metadata_path="tests/datatypes.bool.csv-metadata.json")
    graph = ConjunctiveGraph()
    graph.parse(data=converter.to_rdf(), format="turtle")

    true_lit = Literal(True, datatype=XSD.boolean)
    false_lit = Literal(False, datatype=XSD.boolean)

    expected_by_event = (
        ('event/1', true_lit),
        ('event/2', false_lit),
        ('event/3', false_lit),
    )
    for event, expected in expected_by_event:
        for column in ('bool1', 'bool2', 'bool3'):
            matches = list(graph.triples((NS[event], NS[column], expected)))
            assert len(matches) == 1
Exemplo n.º 18
0
def test_null_values_with_multiple_strings():
    """Check null handling when the metadata lists several null strings.

    None of the listed token literals may appear as an object, and the
    module-level ``id_uri`` predicate must not be used at all.
    """
    converter = CSVW(csv_path="tests/null1.csv",
                     metadata_path="tests/null1.multiple.csv-metadata.json")
    graph = ConjunctiveGraph()
    graph.parse(data=converter.to_rdf(), format="turtle")

    objects = set(graph.objects())

    assert Literal('null_key', datatype=XSD.token) not in objects
    assert Literal('null_sector') not in objects
    assert Literal('null_id', datatype=XSD.token) not in objects
    for id_value in ('10', '11', '12', '13'):
        assert Literal(id_value, datatype=XSD.token) not in objects

    predicates = set(graph.predicates())
    assert id_uri not in predicates

    assert Literal('1', datatype=XSD.token) not in objects
Exemplo n.º 19
0
def test_multiple_tables_through_urls(mock_urlopen):
    """Convert two tables whose CSVs are fetched through a mocked URL open.

    ``mock_urlopen`` returns one reader whose successive ``read()`` calls
    yield the Name-ID table and then the ID-Age table.
    """
    table_urls = ("multiple_tables.Name-ID.csv", "multiple_tables.ID-Age.csv")

    with io.open("tests/multiple_tables.csv-metadata.json", 'r') as meta_file, \
            io.open("tests/multiple_tables.Name-ID.csv") as names_file, \
            io.open("tests/multiple_tables.ID-Age.csv") as ages_file:
        meta_handle = io.StringIO(text(meta_file.read()))
        names_text = text(names_file.read())
        ages_text = text(ages_file.read())

    fake_response = Mock()
    fake_response.read.side_effect = [names_text, ages_text]
    mock_urlopen.return_value = fake_response

    converter = CSVW(csv_url=table_urls, metadata_handle=meta_handle)
    verify_rdf(converter.to_rdf())
Exemplo n.º 20
0
def test_literals_with_escaped_quotes():
    """Check that quoting and escaping in CSV cells is resolved correctly.

    Each expense subject must carry exactly one ``desc`` literal whose
    value equals the unquoted, unescaped text of the cell.
    """
    csv_path = "tests/parsing.escaped_quotes.csv"
    metadata_path = "tests/parsing.escaped_quotes.csv-metadata.json"
    csvw = CSVW(csv_path=csv_path, metadata_path=metadata_path)

    rdf_contents = csvw.to_rdf()
    g = ConjunctiveGraph()
    g.parse(data=rdf_contents, format="turtle")

    ns = Namespace("http://example.org/expense/")
    desc = URIRef("http://example.org/desc")

    def description_of(subject):
        """Return the single description literal attached to *subject*."""
        matches = list(g.triples((subject, desc, None)))
        assert len(matches) == 1
        literal = matches[0][2]
        assert isinstance(literal, Literal)
        return literal

    taxi_desc = description_of(ns['taxi'])
    assert taxi_desc.value == "go from x to y"

    quoted_expense_desc = description_of(
        URIRef("http://example.org/expense/quoted%20expense"))
    assert quoted_expense_desc.value == "for some reason it came with quotes in it"

    flight_desc = description_of(ns['flight'])
    assert flight_desc.value == "had to fly \"escaped quotes business\" for this trip"

    car_desc = description_of(ns['car'])
    # The backslash is written as "\\": a bare "\ " is an invalid escape
    # sequence that only happens to produce the same string and triggers a
    # warning on current Python versions.
    assert car_desc.value == " some \\ in it to be escaped"
Exemplo n.º 21
0
def test_rdf_events_listing():
    """Convert the events-listing example and validate the resulting graph.

    Checks the total triple count, the subjects and predicates in use, and
    the events, start dates, locations, offers, URLs and places described
    by the graph.
    """
    # Generate rdf
    csvw = CSVW(
        csv_path="tests/examples/events-listing.csv",
        metadata_path="tests/examples/events-listing.csv-metadata.json")
    # RIOT throws relative IRI warnings for this example.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", RiotWarning)
        rdf_output = csvw.to_rdf()
    g = rdflib.Graph().parse(data=rdf_output, format="turtle")
    assert len(g) == 20, "Expected 20 triples"

    def assert_both(prefix, nodes):
        """Assert that '<prefix>-1' and '<prefix>-2' each occur in a node."""
        for fragment in (prefix + "-1", prefix + "-2"):
            assert any(fragment in str(node) for node in nodes)

    # Subjects
    subjects = set(g.subjects())
    assert len(subjects) == 6, "There should be 6 subjects"
    expected_subjects = [
        "#event-1", "#event-2", "#place-1", "#place-2", "#offer-1", "#offer-2"
    ]
    for s in expected_subjects:
        assert any(s in str(x) for x in subjects), \
            "{} expected to be among subjects".format(s)

    # Predicates
    predicates = set(g.predicates())
    assert len(predicates) == 7, "There should be 7 predicates"
    schema = rdflib.Namespace("http://schema.org/")
    expected_preds = [
        rdflib.RDF.type, schema.name, schema.startDate, schema.location,
        schema.offers, schema.address, schema.url
    ]
    for p in expected_preds:
        # Compare against the predicates found in the graph; testing
        # membership in expected_preds itself could never fail.
        assert p in predicates, "{} expected to be among predicates".format(p)

    # Events
    events = [
        x[0] for x in g.triples((None, rdflib.RDF.type, schema.MusicEvent))
    ]
    assert len(events) == 2, "Expected 2 events"
    assert_both("#event", events)

    event_names = {
        x[2] for e in events for x in g.triples((e, schema.name, None))
    }
    assert len(event_names) == 1, "There should be one unique event name"
    assert str(next(iter(event_names))) == "B.B. King", \
        "The unique event name should be 'B.B. King'"

    # Start dates
    start_date_data = [(x[0], x[2])
                       for x in g.triples((None, schema.startDate, None))]
    assert len(start_date_data) == 2, "There should be 2 start dates"
    for s in start_date_data:
        assert s[1].datatype == rdflib.XSD.dateTime, \
            "start dates should be xsd:dateTime"
    assert_both("#event", [s[0] for s in start_date_data])

    # Locations
    location_data = [(x[0], x[2])
                     for x in g.triples((None, schema.location, None))]
    assert len(location_data) == 2, "There should be 2 locations"
    assert_both("#place", [x[1] for x in location_data])
    assert_both("#event", [x[0] for x in location_data])

    # Offers, as linked from the events
    offer_data = [(x[0], x[2]) for x in g.triples((None, schema.offers, None))]
    assert len(offer_data) == 2, "There should be 2 offers"
    assert_both("#offer", [x[1] for x in offer_data])
    assert_both("#event", [x[0] for x in offer_data])

    # Offers, as typed resources
    offers = [x[0] for x in g.triples((None, rdflib.RDF.type, schema.Offer))]
    assert len(offers) == 2, "There should be 2 offers"
    assert_both("#offer", offers)

    # URLs
    urls = list(g.triples((None, schema.url, None)))
    assert len(urls) == 2, "There should be 2 urls"
    for u in urls:
        assert u[2].datatype == rdflib.XSD.anyURI, \
            "urls should be of type xsd:anyURI"

    # Places
    places = list(g.triples((None, rdflib.RDF.type, schema.Place)))
    assert len(places) == 2, "There should be 2 places"
    for p in places:
        assert len(list(g.triples((p[0], schema.name, None)))) == 1, \
            "Place should have one name"
        assert len(list(g.triples((p[0], schema.address, None)))) == 1, \
            "Place should have one address"
Exemplo n.º 22
0
def test_rdf_tree_ops_ext():
    """Convert the tree-ops-ext example and validate the resulting graph.

    Checks the total triple count, the per-row subjects and their
    predicates, and the datatype, language and Python value type of the
    objects of every column.
    """
    # Generate rdf
    csvw = CSVW(csv_path="tests/examples/tree-ops-ext.csv",
                metadata_path="tests/examples/tree-ops-ext.csv-metadata.json")
    rdf_output = csvw.to_rdf()
    g = rdflib.Graph().parse(data=rdf_output, format="turtle")

    # Validate rdf
    # 9 columns, 1 suppressed, two null comments and 7 separated comments
    total_triples = (9 - 1) * 3 - 2 + 7
    assert len(g) == total_triples, \
        "Expecting a total of {} triples".format(total_triples)

    table_url = "http://example.org/tree-ops-ext.csv"

    def objects_of(column):
        """Return the objects of all triples whose predicate is *column*."""
        predicate = rdflib.URIRef("{}#{}".format(table_url, column))
        return [t[2] for t in g.triples((None, predicate, None))]

    def assert_untyped(objs, column):
        """Assert that *objs* carry neither a datatype nor a language."""
        for o in objs:
            assert o.datatype is None, \
                "datatype should not be specified for {} data".format(column)
            assert o.language is None, \
                "language should not be specified for {} data".format(column)

    # Subject
    subjects = set(g.subjects())
    assert len(subjects) == 3, "Expecting 3 subjects, one for each row"
    # Subjects should be named as specified by aboutURL and primary key
    expected_preds = ["on_street", "species", "trim_cycle", "dbh",
                      "inventory_date", "protected", "kml"]
    pred_comments = rdflib.URIRef("{}#comments".format(table_url))
    for ind in [1, 2, 6]:
        uri = rdflib.URIRef(
            "http://example.org/tree-ops-ext#gid-{}".format(ind))
        assert uri in subjects, "{} not among subjects".format(uri)
        triples = list(g.triples((uri, None, None)))
        preds = [x[1] for x in triples]
        for p in expected_preds:
            p_uri = rdflib.URIRef("{}#{}".format(table_url, p))
            assert p_uri in preds, \
                "Predicate {} expected to be in triples of {}".format(
                    p_uri, uri)

        if ind == 6:
            # Only this row is expected to carry comments triples.
            assert len(triples) == 15, \
                "{} expected to have 15 triples".format(uri)
            assert pred_comments in preds, \
                "Predicate {} should be there for {}".format(
                    pred_comments, uri)
        else:
            assert len(triples) == 7, \
                "{} expected to have 7 triples".format(uri)
            assert pred_comments not in preds, \
                "Predicate {} should not be there for {}".format(
                    pred_comments, uri)

    # Predicates
    predicates = set(g.predicates())
    assert len(predicates) == 8, "Expecting 8 different predicates"

    # Objects
    on_street_objs = objects_of("on_street")
    assert len(on_street_objs) == 3, "There should be 3 on_street triples"
    assert_untyped(on_street_objs, "on_street")

    species_objs = objects_of("species")
    assert len(species_objs) == 3, "There should be 3 species triples"
    assert_untyped(species_objs, "species")

    trim_cycle_objs = objects_of("trim_cycle")
    assert len(trim_cycle_objs) == 3, "There should be 3 trim_cycle triples"
    for o in trim_cycle_objs:
        assert o.datatype is None, \
            "datatype should not be specified for trim_cycle data"
        assert str(o.language) == "en", \
            "language should be specified as en for trim_cycle data"

    dbh_objs = objects_of("dbh")
    assert len(dbh_objs) == 3, "There should be 3 dbh triples"
    for o in dbh_objs:
        # NOTE(review): `long` is a Python 2 builtin; presumably it is
        # aliased elsewhere in this module for Python 3 - confirm.
        assert isinstance(o.value, long), "dbh should be an integer"
        assert o.datatype == rdflib.XSD.integer, \
            "datatype should be specified as integer for dbh data"
        assert o.language is None, \
            "language should not be specified for dbh data"

    inv_date_objs = objects_of("inventory_date")
    assert len(inv_date_objs) == 3, \
        "There should be 3 inventory_date triples"
    for o in inv_date_objs:
        assert isinstance(o.value, date), "inventory_date should be a date"
        assert o.value.year == 2010, "inventory_date year should be 2010"
        assert o.datatype == rdflib.XSD.date, \
            "datatype should be specified as date for inventory_date data"
        assert o.language is None, \
            "language should not be specified for inventory_date data"

    protected_objs = objects_of("protected")
    assert len(protected_objs) == 3, "There should be 3 protected triples"
    for o in protected_objs:
        assert isinstance(o.value, bool), "protected should be a bool"
        assert o.datatype == rdflib.XSD.boolean, \
            "datatype should be specified as bool for protected data"
        assert o.language is None, \
            "language should not be specified for protected data"

    kml_objs = objects_of("kml")
    assert len(kml_objs) == 3, "There should be 3 kml triples"
    for o in kml_objs:
        assert isinstance(o.value, xml.dom.minidom.Document), \
            "kml should be an XML document"
        assert o.datatype == rdflib.RDF.XMLLiteral, \
            "datatype should be specified as xml for kml data"
        assert o.language is None, \
            "language should not be specified for kml data"

    comments_objs = objects_of("comments")
    assert len(comments_objs) == 8, "There should be 8 comments"
    assert_untyped(comments_objs, "comments")
Exemplo n.º 23
0
def test_group_of_tables(mock_urlopen):
    """Convert a group of four tables fetched by URL and check the graph.

    ``mock_urlopen`` is routed through ``dispatch_files_as_url`` (defined
    elsewhere in this module).  The test then checks the triples of two
    senior posts, of the two people holding them and of the two junior
    posts (found through their grade literals), and finally the total
    number of triples.
    """
    mock_urlopen.side_effect = dispatch_files_as_url
    converter = CSVW(
        csv_url=[
            "http://example.org/gov.uk/data/organizations.csv",
            "http://example.org/gov.uk/data/professions.csv",
            "http://example.org/senior-roles.csv",
            "http://example.org/junior-roles.csv"
        ],
        metadata_url="http://example.org/csv-metadata.json")
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", RiotWarning)
        turtle_text = converter.to_rdf()
    g = rdflib.Graph().parse(data=turtle_text, format="turtle")

    org = Namespace("http://www.w3.org/ns/org#")
    post_in = URIRef("http://example.org/organization/hefce.ac.uk")
    grade = URIRef("http://example.org/gov.uk/def/grade")
    job = URIRef("http://example.org/gov.uk/def/job")
    prof = URIRef("http://example.org/gov.uk/def/profession")
    post = Namespace("http://example.org/organization/hefce.ac.uk/post/")
    person = Namespace("http://example.org/organization/hefce.ac.uk/person/")
    min_pay = URIRef("http://example.org/gov.uk/def/min_pay")
    max_pay = URIRef("http://example.org/gov.uk/def/max_pay")
    num_posts = URIRef("http://example.org/gov.uk/def/number_of_posts")

    post_90115 = post["90115"]
    post_90334 = post["90334"]
    p1 = person["1"]
    p2 = person["2"]

    def check_subject(subject, expected_count, expected_triples):
        """Assert the triple count of *subject* and that each triple exists."""
        found = list(g.triples((subject, None, None)))
        assert len(found) == expected_count
        for triple in expected_triples:
            assert triple in found

    def check_junior_post(grade_value, low, high, posts):
        """Check the blank-node post identified by its grade literal."""
        node = list(g.triples((None, grade, Literal(grade_value))))[0][0]
        # Eight triples in total: the seven listed here plus the grade
        # triple that was used to locate the node.
        check_subject(node, 8, [
            (node, org.reportsTo, post_90115),
            (node, min_pay, Literal(low, datatype=XSD.integer)),
            (node, max_pay, Literal(high, datatype=XSD.integer)),
            (node, job, Literal("Administrator")),
            (node, num_posts, Literal(posts, datatype=XSD.double)),
            (node, prof, Literal("Operational Delivery")),
            (node, org.postIn, post_in),
        ])

    check_subject(post_90115, 7, [
        (post_90115, DCTERMS.identifier, Literal("90115")),
        (post_90115, org.heldBy, p1),
        (post_90115, grade, Literal("SCS1A")),
        (post_90115, job, Literal("Deputy Chief Executive")),
        (post_90115, org.reportsTo, post_90334),
        (post_90115, prof, Literal("Finance")),
        (post_90115, org.postIn, post_in),
    ])

    check_subject(p1, 1, [(p1, FOAF.name, Literal("Steve Egan"))])

    check_subject(post_90334, 6, [
        (post_90334, DCTERMS.identifier, Literal("90334")),
        (post_90334, org.heldBy, p2),
        (post_90334, grade, Literal("SCS4")),
        (post_90334, job, Literal("Chief Executive")),
        (post_90334, prof, Literal("Policy")),
        (post_90334, org.postIn, post_in),
    ])

    check_subject(p2, 1, [(p2, FOAF.name, Literal("Sir Alan Langlands"))])

    check_junior_post("4", 17426, 20002, 8.67)
    check_junior_post("5", 19546, 22478, 0.5)

    # Nothing beyond the six subjects checked above may be in the graph.
    assert len(list(g.triples((None, None, None)))) == 7 + 1 + 6 + 1 + 8 + 8
Exemplo n.º 24
0
def test_multiple_value_urls_in_virtual():
    """Check the RDF produced for virtual columns with list-valued valueUrls.

    Converts tests/value_urls.csv to N-Triples, parses the result with rdflib
    and verifies, for the `amount`, `description` and `id` subjects:
    their definition, type and range triples; the OWL datatype and the
    restriction list attached to each range node; the constant list attached
    to each range node; that the empty list predicates are absent; and that
    the expected number of lists is terminated with rdf:nil.
    """
    csvw = CSVW(csv_path="tests/value_urls.csv",
                metadata_path="tests/value_urls.csv-metadata.json")
    rdf_contents = csvw.to_rdf(fmt="nt")
    g = ConjunctiveGraph()
    g.parse(data=rdf_contents, format="nt")

    def non_negative(value):
        """Build an xsd:nonNegativeInteger literal for `value`."""
        return Literal(value, datatype=XSD.nonNegativeInteger)

    def assert_rdf_list(subject, predicate, expected_items):
        """Assert `subject` points via `predicate` at the expected RDF list.

        The first object found must be a blank node heading a list whose
        rdf:first values match `expected_items` in order. The node reached
        after the last item must have neither rdf:first nor rdf:rest triples.
        """
        node = list(g.triples((subject, predicate, None)))[0][2]
        assert isinstance(node, BNode)
        for item in expected_items:
            assert len(list(g.triples((node, RDF.first, item)))) == 1
            node = next(g.objects(subject=node, predicate=RDF.rest))
        # The terminating node carries no outgoing list triples.
        assert len(list(g.triples((node, RDF.first, None)))) == 0
        assert len(list(g.triples((node, RDF.rest, None)))) == 0

    # Test subjects
    all_subjects = list(g.subjects())
    s_amount = NS['amount']
    s_desc = NS['description']
    s_id = NS['id']
    assert s_amount in all_subjects
    assert s_desc in all_subjects
    assert s_id in all_subjects

    # Test descriptions
    p_def = NS['definition']
    assert len(list(g.triples(
        (s_amount, p_def, Literal("the amount paid"))))) == 1
    assert len(
        list(g.triples(
            (s_desc, p_def, Literal("description of the expense"))))) == 1
    assert len(list(g.triples((s_id, p_def, Literal("transaction id"))))) == 1

    # Test each is an element type
    o_element = NS['element']
    assert len(list(g.triples((s_amount, RDF.type, o_element)))) == 1
    assert len(list(g.triples((s_desc, RDF.type, o_element)))) == 1
    assert len(list(g.triples((s_id, RDF.type, o_element)))) == 1

    # Test that range is specified
    r_amount = NS['element/amount-RANGE']
    r_desc = NS['element/description-RANGE']
    r_id = NS['element/id-RANGE']

    assert len(list(g.triples((s_amount, RDFS.range, r_amount)))) == 1
    assert len(list(g.triples((s_desc, RDFS.range, r_desc)))) == 1
    assert len(list(g.triples((s_id, RDFS.range, r_id)))) == 1

    # Range is another subject
    assert r_amount in all_subjects
    assert r_desc in all_subjects
    assert r_id in all_subjects

    # Range is an OWL datatype of specified type
    assert len(list(g.triples((r_amount, OWL.onDatatype, XSD.decimal)))) == 1
    assert len(list(g.triples((r_desc, OWL.onDatatype, XSD.string)))) == 1
    assert len(list(g.triples((r_id, OWL.onDatatype, XSD.integer)))) == 1

    # Check the restrictions for amount
    assert_rdf_list(r_amount, OWL.withRestrictions, [
        XSD.decimal,
        XSD.MaxLength,
        non_negative(10),
        XSD.MinLength,
        non_negative(1),
    ])

    # Check the restrictions for description
    assert_rdf_list(r_desc, OWL.withRestrictions, [
        XSD.string,
        XSD.MaxLength,
        non_negative(100),
    ])

    # Check the restrictions for id
    assert_rdf_list(r_id, OWL.withRestrictions, [
        XSD.integer,
        XSD.MinLength,
        non_negative(0),
    ])

    # Check constant value for each range node. The lookup must use the loop
    # variable; querying r_amount every time would leave r_id and r_desc
    # unverified.
    const_prop = NS['another-list-value-with-constants']
    for s in [r_amount, r_id, r_desc]:
        assert_rdf_list(s, const_prop, [XSD.Length, non_negative(1)])

    # Verify that empty valueUrl does not end up in graph or rdf contents
    assert NS['empty-list-predicate1'] not in list(g.objects())
    assert "empty-list-predicate1" not in rdf_contents

    # Verify that empty valueUrl does not end up in graph or rdf contents
    assert NS['empty-list-predicate2'] not in list(g.objects())
    assert "empty-list-predicate2" not in rdf_contents

    # Test total number of lists through rdf:nils in order to verify each list
    # ends up with a nil
    test_num_lists = 3 * 3  # 3 rows and 3 virtual list valued columns
    nil_text = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#nil> ."
    assert rdf_contents.count(nil_text) == test_num_lists
Exemplo n.º 25
0
def test_negative_invalid_column():
    """Converting virtual1.csv with the negative2 metadata must fail.

    The conversion is expected to raise FailedSubstitutionError.
    """
    bad_metadata = 'tests/virtual1.negative2.csv-metadata.json'
    converter = CSVW(csv_path='tests/virtual1.csv',
                     metadata_path=bad_metadata)

    # Construction succeeds; the error is expected only on conversion.
    with pytest.raises(FailedSubstitutionError):
        print(converter.to_rdf())
TRIPLES_SIZES = [
    20000, 30000, 50000, 100000, 200000, 300000, 500000, 1000000, 2000000
]
FORMATS = ["turtle", "xml", "json-ld"]

for fmt in FORMATS:
    print "|Number of triples|pycsvw {fmt} (sec)|rdflib {fmt} (sec)|".format(
        fmt=fmt)
    for num_triples in TRIPLES_SIZES:
        generate_csv_and_metadata(num_triples)
        start = time.time()
        # Generate nt first for fairness to pycsvw
        csvw = CSVW(
            csv_path="csvfile.{}.csv".format(num_triples),
            metadata_path="csvfile.{}.csv-metadata.json".format(num_triples))
        pycsvw_output = csvw.to_rdf(fmt)
        with open(
                "{fmt}file.{num_t}.pycsvw.{fmt}".format(fmt=fmt,
                                                        num_t=num_triples),
                "w") as out_file:
            out_file.write(pycsvw_output.encode("utf-8"))
        pycsvw_time = time.time() - start

        # Write the same contents into an nt-file using rdflib
        num_rows = int(num_triples) / NUM_COLS
        start = time.time()
        g = ConjunctiveGraph()
        for row in xrange(num_rows):
            for col in xrange(NUM_COLS):
                g.add(
                    (URIRef(
Exemplo n.º 27
0
from speed_test_csv import NUM_COLS, generate_csv_and_metadata

# Benchmark script: for each target triple count, time pycsvw's N-Triples
# conversion and then build the equivalent graph with rdflib.
# NOTE: uses Python 2 syntax (print statement, xrange).

# Target numbers of triples to benchmark, in increasing order.
TRIPLES_SIZES = [
    20000, 30000, 50000, 100000, 200000, 300000, 500000, 1000000, 2000000
]

# Header row of the pipe-delimited results table.
print "|Number of triples|pycsvw (sec)|rdflib (sec)|"
for num_triples in TRIPLES_SIZES:
    # Generate csv and its metadata
    generate_csv_and_metadata(num_triples)
    # Generate NT using pycsvw
    # The timed region covers CSVW construction, conversion to "nt" and
    # writing the encoded output to disk.
    start = time.time()
    csvw = CSVW(
        csv_path="csvfile.{}.csv".format(num_triples),
        metadata_path="csvfile.{}.csv-metadata.json".format(num_triples))
    nt_output = csvw.to_rdf("nt")
    with open("ntfile.{}.pycsvw.nt".format(num_triples), "w") as nt_file:
        nt_file.write(nt_output.encode("utf-8"))
    pycsvw_nt_time = time.time() - start

    # Generate equivalent contents using rdflib
    # One triple is added per (row, column) pair below, so the row count is
    # the triple count divided by NUM_COLS.
    # NOTE(review): relies on `/` yielding an int for xrange; presumably
    # NUM_COLS is an int under Python 2 -- confirm.
    num_rows = int(num_triples) / NUM_COLS
    # Restart the clock for the rdflib measurement.
    start = time.time()
    g = ConjunctiveGraph()

    for row in xrange(num_rows):
        for col in xrange(NUM_COLS):
            # The subject always uses the "col0" suffix, so every triple of a
            # row shares one subject; predicate and literal vary per column.
            g.add(
                (URIRef("http://www.example.org/subjectrow{}col0".format(row)),
                 URIRef("http://www.example.org/predcolumn{}".format(col)),
                 Literal("row{}col{}".format(row, col))))
Exemplo n.º 28
0
def test_url_safe_chars():
    """Check URL handling of special characters in cell values.

    Converts tests/url_special_chars.csv, parses the default output as
    Turtle, and for each expected subject runs the non-virtual and virtual
    column verifiers. Each case lists the subject URI, the cell literals
    passed to verify_non_virtual_columns, and the two strings passed to
    verify_virtual_columns (the second differs from the first only in that
    the space is written as %20).
    """
    converter = CSVW(
        csv_path="tests/url_special_chars.csv",
        metadata_path="tests/url_special_chars.csv-metadata.json")
    turtle_text = converter.to_rdf()

    g = ConjunctiveGraph()
    g.parse(data=turtle_text, format="turtle")

    # (subject URI, cell values, plain text, text with %20 for the space)
    cases = [
        ('http://www.example.org/c#1/chash2/chash3/chash4/chash5/chash6',
         ['c#1', 'chash2', 'chash3', 'chash4', 'chash6', 'chash5'],
         '#/:- _r1', '#/:-%20_r1'),
        ('http://www.example.org/c/1/c/2/c/3/c/4/c/5/c/6',
         ['c/1', 'c/2', 'c/3', 'c/4', 'c/6', 'c/5'],
         '/#:- _r2', '/#:-%20_r2'),
        ('http://www.example.org/c:1/c:2/c:3/c:4/c:5/c:6',
         ['c:1', 'c:2', 'c:3', 'c:4', 'c:6', 'c:5'],
         ':#/-_ r3', ':#/-_%20r3'),
        ('http://www.example.org/c-1/c-2/c-3/c-4/c-5/c-6',
         ['c-1', 'c-2', 'c-3', 'c-4', 'c-6', 'c-5'],
         '-/#_ :r4', '-/#_%20:r4'),
        ('http://www.example.org/c%201/c%202/c%203/c%204/c%205/c%206',
         ['c 1', 'c 2', 'c 3', 'c 4', 'c 6', 'c 5'],
         ' -/#:_r5', '%20-/#:_r5'),
        ('http://www.example.org/c_1/c_2/c_3/c_4/c_5/c_6',
         ['c_1', 'c_2', 'c_3', 'c_4', 'c_6', 'c_5'],
         '_ /:#r6', '_%20/:#r6'),
    ]

    for subject_uri, cell_values, plain_text, encoded_text in cases:
        subject = URIRef(subject_uri)
        expected_literals = [Literal(value) for value in cell_values]
        verify_non_virtual_columns(subject, g, expected_literals)
        verify_virtual_columns(subject, g, plain_text, encoded_text)
Exemplo n.º 29
0
def test_individual_formats(fmt, validate_func, rdflib_input):
    """Convert the books CSV to `fmt` and validate the serialized output.

    The output is first handed to `validate_func`, then passed together
    with `rdflib_input` to verify_rdf_contents.
    """
    books = CSVW(
        csv_path="./tests/books.csv",
        metadata_path="./tests/books.csv-metadata.json",
    )
    serialized = books.to_rdf(fmt=fmt)

    validate_func(serialized)
    verify_rdf_contents(serialized, rdflib_input)