Example #1
    def get(self, portalid, datasetid):
        if request.headers.get('Accept-Datetime'):
            acc_dt = request.headers['Accept-Datetime']
            sn = getSnapshotfromTime(parse_rfc1123(acc_dt))
        else:
            sn = getCurrentSnapshot()


        session = current_app.config['dbsession']
        p = session.query(Portal).filter(Portal.id == portalid).first()
        q = session.query(DatasetQuality) \
            .join(Dataset, DatasetQuality.md5 == Dataset.md5) \
            .filter(Dataset.snapshot == sn) \
            .filter(Dataset.portalid == portalid) \
            .filter(Dataset.id == datasetid)
        dataset_qual = q.first()

        q = session.query(Dataset) \
            .filter(Dataset.snapshot == sn) \
            .filter(Dataset.portalid == portalid) \
            .filter(Dataset.id == datasetid)
        dataset = q.first()
        # get rdf graph and add measures and dimensions
        g, ds_id = dqv_export._get_measures_for_dataset(p, dataset, dataset_qual)
        dqv_export.add_dimensions_and_metrics(g)
        resp = jsonify(json.loads(g.serialize(format="json-ld")))
        timegate = '<' + HOST + '/' + portalid + '/' + datasetid + '/dqv>'
        resp = add_memento_header(resp, ds_id.n3(), timegate, sn)

        resp.headers['Vary'] = 'accept-datetime'
        d = tofirstdayinisoweek(sn)
        # Content-Location carries a bare URI; angle brackets belong only in the Link header
        full_url = HOST + '/' + portalid + '/' + d.strftime("%y%m%d") + '/' + datasetid + '/dqv'
        resp.headers['Content-Location'] = full_url
        return resp
Example #2
def dataset_quality_to_dqv(graph, ds_id, datasetquality, snapshot, fetch_activity=None):
    sn_time = utils_snapshot.tofirstdayinisoweek(snapshot)

    # BNodes: ds_id + snapshot + metric + value
    # add quality metrics to graph
    # TODO should we use portalwatch URI?
    for metric, value in [(PWQ.Date, datasetquality.exda), (PWQ.Rights, datasetquality.exri), (PWQ.Preservation, datasetquality.expr),
                          (PWQ.Access, datasetquality.exac), (PWQ.Discovery, datasetquality.exdi), (PWQ.Contact, datasetquality.exco),
                          (PWQ.ContactURL, datasetquality.cocu), (PWQ.DateFormat, datasetquality.coda), (PWQ.FileFormat, datasetquality.cofo),
                          (PWQ.ContactEmail, datasetquality.coce), (PWQ.License, datasetquality.coli), (PWQ.AccessURL, datasetquality.coac),
                          (PWQ.OpenFormat, datasetquality.opfo), (PWQ.MachineRead, datasetquality.opma), (PWQ.OpenLicense, datasetquality.opli)]:
        # add unique BNodes
        bnode_hash = hashlib.sha1(
            (ds_id.n3() + str(snapshot) + metric.n3() + str(value)).encode('utf-8'))
        m = BNode(bnode_hash.hexdigest())

        graph.add((m, DQV.isMeasurementOf, metric))
        graph.add((m, DQV.value, Literal(value)))

        # add additional triples
        graph.add((ds_id, DQV.hasQualityMeasurement, m))
        graph.add((m, RDF.type, DQV.QualityMeasurement))
        graph.add((m, DQV.computedOn, ds_id))
        if fetch_activity:
            # add prov to each measure
            quality_prov(m, ds_id, sn_time, fetch_activity, graph)
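
A minimal driver for this helper, with the quality row stubbed out (the field names are taken from the loop above; real values would come from a DatasetQuality row as in Example #1), might look like:

from types import SimpleNamespace

from rdflib import Graph, URIRef

g = Graph()
ds_id = URIRef("http://example.org/dataset/abc")
# Stub standing in for a DatasetQuality row; the values are made up for illustration.
quality = SimpleNamespace(exda=1.0, exri=0.5, expr=0.0, exac=1.0, exdi=0.8,
                          exco=1.0, cocu=0.0, coda=1.0, cofo=1.0, coce=0.0,
                          coli=1.0, coac=1.0, opfo=1.0, opma=0.5, opli=1.0)
dataset_quality_to_dqv(g, ds_id, quality, snapshot=1740)
print(g.serialize(format="turtle"))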
Example #3
    def get(self, portalid, datasetid):
        if request.headers.get('Accept-Datetime'):
            acc_dt = request.headers['Accept-Datetime']
            sn = getSnapshotfromTime(parse_rfc1123(acc_dt))
        else:
            sn = getCurrentSnapshot()

        session = current_app.config['dbsession']

        q = session.query(DatasetData) \
            .join(Dataset, DatasetData.md5 == Dataset.md5) \
            .filter(Dataset.snapshot == sn) \
            .filter(Dataset.portalid == portalid) \
            .filter(Dataset.id == datasetid)
        data = q.first()
        p = session.query(Portal).filter(Portal.id == portalid).first()
        doc = dcat_to_schemadotorg.convert(p, data.raw)
        timegate = '<' + HOST + '/' + portalid + '/' + datasetid + '/schemadotorg>'
        resp = add_memento_header(jsonify(doc), '<' + doc['@id'] + '>', timegate, sn)

        resp.headers['Vary'] = 'accept-datetime'
        d = tofirstdayinisoweek(sn)
        # Content-Location carries a bare URI; angle brackets belong only in the Link header
        full_url = HOST + '/' + portalid + '/' + d.strftime("%y%m%d") + '/' + datasetid + '/schemadotorg'
        resp.headers['Content-Location'] = full_url
        return resp
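
dcat_to_schemadotorg.convert is external to this excerpt; it returns a JSON-LD document whose '@id' feeds the Memento Link header above. A toy sketch of the expected shape (the field mapping and the portal.uri attribute are assumptions, not the project's real converter):

def convert(portal, raw):
    # Toy sketch only: map a handful of DCAT fields to a schema.org/Dataset
    # JSON-LD document. The real converter covers far more of DCAT.
    return {
        '@context': 'http://schema.org',
        '@type': 'Dataset',
        '@id': portal.uri + '/dataset/' + raw.get('id', ''),  # portal.uri is assumed
        'name': raw.get('title'),
        'description': raw.get('notes'),
    }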
Example #4
def generate_schemadotorg_files(obj):
    P, dbConf, snapshot, out_dir = obj  # tuple-unpack; out_dir avoids shadowing dir()
    sitemap_urls = []

    dbm = DBManager(**dbConf)
    session = scoped_session(sessionmaker(bind=dbm.engine))

    log.info("Start schema.org files", portal=P.id)
    portal_dir = os.path.join(out_dir, P.id)
    if not os.path.exists(portal_dir):
        os.mkdir(portal_dir)

    p = session.query(Portal).filter(Portal.id == P.id).first()
    q = session.query(Dataset) \
        .filter(Dataset.snapshot == snapshot) \
        .filter(Dataset.portalid == P.id)
    datasetlist = []

    for i, d in enumerate(q.all(), start=1):
        try:
            q = session.query(DatasetData) \
                .join(Dataset, DatasetData.md5 == Dataset.md5) \
                .filter(Dataset.snapshot == snapshot) \
                .filter(Dataset.portalid == P.id) \
                .filter(Dataset.id == d.id)
            data = q.first()
            doc = dcat_to_schemadotorg.convert(p, data.raw)

            # urlsafe_b64encode expects bytes and returns bytes in Python 3
            dataset_filename = base64.urlsafe_b64encode(d.id.encode('utf-8')).decode('ascii')
            dataset_file = os.path.join(portal_dir, dataset_filename)
            create_schemadotorg(doc, dataset_file)
            t = d.title if d.title else d.id

            datasetlist.append((t, dataset_filename))
            dt = data.modified
            # treat obviously bogus modification dates (before 1980) as missing
            if dt is not None and dt < datetime.datetime(1980, 1, 1):
                dt = None
            sitemap_urls.append((dataset_file, dt))
            if i % 50000 == 0:
                log.info("Processed datasets", datasets=str(i))

        except Exception as exc:
            ErrorHandler.handleError(log,
                                     "CreateSchema.orgFile",
                                     exception=exc,
                                     pid=P.id,
                                     snapshot=snapshot,
                                     exc_info=True)

    create_datasetlist(datasetlist, portal_dir)
    create_sitemap(sitemap_urls, portal_dir)

    dt = tofirstdayinisoweek(snapshot)
    return P, dt, snapshot
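
create_schemadotorg, create_datasetlist, and create_sitemap are not part of this excerpt. As one example, a minimal sketch of create_sitemap, assuming it writes a standard sitemap.xml with optional lastmod entries, could be:

import os

def create_sitemap(sitemap_urls, portal_dir):
    # Minimal sketch: emit a standard sitemap.xml; the real helper may differ.
    lines = ['<?xml version="1.0" encoding="UTF-8"?>',
             '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">']
    for url, modified in sitemap_urls:
        lines.append('  <url>')
        lines.append('    <loc>%s</loc>' % url)
        if modified is not None:
            lines.append('    <lastmod>%s</lastmod>' % modified.strftime('%Y-%m-%d'))
        lines.append('  </url>')
    lines.append('</urlset>')
    with open(os.path.join(portal_dir, 'sitemap.xml'), 'w') as f:
        f.write('\n'.join(lines))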
Example #5
def adequate_prov(graph, snapshot):
    ad_activity = URIRef("http://www.adequate.at/csvprofiler/" + str(snapshot))
    graph.add((ad_activity, RDF.type, PROV.Activity))
    graph.add((ad_activity, PROV.startedAtTime,
               Literal(tofirstdayinisoweek(snapshot))))
    graph.add(
        (ad_activity, PROV.endedAtTime, Literal(toLastdayinisoweek(snapshot))))
    graph.add((ad_activity, PROV.wasAssociatedWith, AD_AGENT))
    return ad_activity
Example #6
def csvw_prov(graph, snapshot):
    csvw_activity = URIRef("http://data.wu.ac.at/portalwatch/csvw/" +
                           str(snapshot))
    graph.add((csvw_activity, RDF.type, PROV.Activity))
    graph.add((csvw_activity, PROV.startedAtTime,
               Literal(tofirstdayinisoweek(snapshot))))
    graph.add((csvw_activity, PROV.endedAtTime,
               Literal(toLastdayinisoweek(snapshot))))
    graph.add((csvw_activity, PROV.wasAssociatedWith, PW_AGENT))
    return csvw_activity
Example #7
    def get(self, portalid, datasetid):
        if request.headers.get('Accept-Datetime'):
            acc_dt = request.headers['Accept-Datetime']
            sn = getSnapshotfromTime(parse_rfc1123(acc_dt))
        else:
            sn = getCurrentSnapshot()

        resp = get_dataset(portalid, sn, datasetid)
        resp.headers['Vary'] = 'accept-datetime'
        d = tofirstdayinisoweek(sn)
        full_url = HOST + '/' + portalid + '/' + d.strftime("%Y%m%d") + '/' + datasetid
        resp.headers['Content-Location'] = full_url
        return resp
Example #8
def add_memento_header(resp, orig_ref, timegate, snapshot):
    # timestamp of snapshot; calendar.timegm keeps the naive datetime in UTC,
    # matching usegmt=True below (time.mktime would shift it by the local timezone)
    dt = tofirstdayinisoweek(snapshot)
    stamp = calendar.timegm(dt.timetuple())
    formatted_dt = formatdate(
        timeval=stamp,
        localtime=False,
        usegmt=True
    )
    resp.headers['Memento-Datetime'] = formatted_dt
    # link to original resource
    resp.headers['Link'] = orig_ref + '; rel="original", ' + timegate + '; rel="timegate"'
    return resp
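
tofirstdayinisoweek and toLastdayinisoweek are used throughout these examples but never defined. A plausible sketch, under the assumption that snapshots are YYWW integers (e.g. 1740 = ISO week 40 of 2017; the actual encoding may differ), would be:

import datetime

def tofirstdayinisoweek(snapshot):
    # Assumed encoding: YYWW integer, e.g. 1740 = 2017, ISO week 40.
    year = 2000 + snapshot // 100
    week = snapshot % 100
    # Monday (ISO weekday 1) of that week at midnight; needs Python 3.8+.
    return datetime.datetime.combine(
        datetime.date.fromisocalendar(year, week, 1), datetime.time.min)

def toLastdayinisoweek(snapshot):
    # Sunday of the same ISO week.
    return tofirstdayinisoweek(snapshot) + datetime.timedelta(days=6)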
Example #9
def get_dataset(portalid, snapshot, datasetid):
    session = current_app.config['dbsession']

    q = session.query(DatasetData) \
        .join(Dataset, DatasetData.md5 == Dataset.md5) \
        .filter(Dataset.snapshot <= snapshot) \
        .filter(Dataset.portalid == portalid) \
        .filter(Dataset.id == datasetid) \
        .order_by(Dataset.snapshot.desc())
    data = q.first()
    if data:
        resp = jsonify(row2dict(data))
        portal = session.query(Portal).filter(Portal.id == portalid).first()
        g = rdflib.Graph()
        dataset_ref = add_dcat_to_graph(data.raw, portal, g, None)
        timegate = '<' + HOST + '/' + portalid + '/' + datasetid + '>'
        return add_memento_header(resp, dataset_ref.n3(), timegate, snapshot)
    else:
        resp = jsonify(
            {'error': 'There is no version of dataset ' + datasetid + ' available that is older than ' + str(tofirstdayinisoweek(snapshot)),
             'portalid': portalid})
        resp.status_code = 404
        return resp
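
Taken together, these endpoints implement Memento-style datetime negotiation (RFC 7089): the client sends Accept-Datetime and reads Memento-Datetime, Link, and Content-Location from the response. A hypothetical client call (host and identifiers are placeholders):

import requests

# Placeholder URL; substitute a real portal id and dataset id.
url = 'http://example.org/api/v1/memento/some-portal/some-dataset'
headers = {'Accept-Datetime': 'Thu, 05 Oct 2017 00:00:00 GMT'}  # RFC 1123 date

resp = requests.get(url, headers=headers)
print(resp.headers.get('Memento-Datetime'))   # timestamp of the served snapshot
print(resp.headers.get('Content-Location'))   # snapshot-specific URI
print(resp.headers.get('Link'))               # rel="original" and rel="timegate"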