def get(self, portalid, datasetid):
    """DQV quality measurements for a dataset, selected via Memento datetime negotiation."""
    if request.headers.get('Accept-Datetime'):
        acc_dt = request.headers['Accept-Datetime']
        sn = getSnapshotfromTime(parse_rfc1123(acc_dt))
    else:
        sn = getCurrentSnapshot()

    session = current_app.config['dbsession']
    p = session.query(Portal).filter(Portal.id == portalid).first()
    q = session.query(DatasetQuality) \
        .join(Dataset, DatasetQuality.md5 == Dataset.md5) \
        .filter(Dataset.snapshot == sn) \
        .filter(Dataset.portalid == portalid) \
        .filter(Dataset.id == datasetid)
    dataset_qual = q.first()

    q = session.query(Dataset) \
        .filter(Dataset.snapshot == sn) \
        .filter(Dataset.portalid == portalid) \
        .filter(Dataset.id == datasetid)
    dataset = q.first()

    # get rdf graph and add measures and dimensions
    g, ds_id = dqv_export._get_measures_for_dataset(p, dataset, dataset_qual)
    dqv_export.add_dimensions_and_metrics(g)
    resp = jsonify(json.loads(g.serialize(format="json-ld")))

    timegate = '<' + HOST + '/' + portalid + '/' + datasetid + '/dqv>'
    resp = add_memento_header(resp, ds_id.n3(), timegate, sn)
    resp.headers['Vary'] = 'accept-datetime'
    d = tofirstdayinisoweek(sn)
    full_url = '<' + HOST + '/' + portalid + '/' + d.strftime("%y%m%d") + '/' + datasetid + '/dqv>'
    resp.headers['Content-Location'] = full_url
    return resp
def dataset_quality_to_dqv(graph, ds_id, datasetquality, snapshot, fetch_activity=None):
    sn_time = utils_snapshot.tofirstdayinisoweek(snapshot)
    # BNodes: ds_id + snapshot + metric + value
    # add quality metrics to graph
    # TODO should we use portalwatch URI?
    for metric, value in [(PWQ.Date, datasetquality.exda), (PWQ.Rights, datasetquality.exri),
                          (PWQ.Preservation, datasetquality.expr), (PWQ.Access, datasetquality.exac),
                          (PWQ.Discovery, datasetquality.exdi), (PWQ.Contact, datasetquality.exco),
                          (PWQ.ContactURL, datasetquality.cocu), (PWQ.DateFormat, datasetquality.coda),
                          (PWQ.FileFormat, datasetquality.cofo), (PWQ.ContactEmail, datasetquality.coce),
                          (PWQ.License, datasetquality.coli), (PWQ.AccessURL, datasetquality.coac),
                          (PWQ.OpenFormat, datasetquality.opfo), (PWQ.MachineRead, datasetquality.opma),
                          (PWQ.OpenLicense, datasetquality.opli)]:
        # add unique BNodes
        bnode_hash = hashlib.sha1(ds_id.n3() + str(snapshot) + metric.n3() + str(value))
        m = BNode(bnode_hash.hexdigest())

        graph.add((m, DQV.isMeasurementOf, metric))
        graph.add((m, DQV.value, Literal(value)))
        # add additional triples
        graph.add((ds_id, DQV.hasQualityMeasurement, m))
        graph.add((m, RDF.type, DQV.QualityMeasurement))
        graph.add((m, DQV.computedOn, ds_id))

        if fetch_activity:
            # add prov to each measure
            quality_prov(m, ds_id, sn_time, fetch_activity, graph)
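# quality_prov() is called above but not shown in this section. A minimal sketch of
# what such a helper could add, assuming each DQV measurement is linked to the
# snapshot's fetch activity via standard PROV-O terms; the exact triples are an
# assumption, not necessarily the project's actual implementation.
def quality_prov(measurement, ds_id, sn_time, fetch_activity, graph):
    # the measurement was produced by the snapshot's fetch/quality activity
    graph.add((measurement, PROV.wasGeneratedBy, fetch_activity))
    # record the snapshot time (first day of the ISO week) as generation time
    graph.add((measurement, PROV.generatedAtTime, Literal(sn_time)))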
def get(self, portalid, datasetid):
    """schema.org representation of a dataset, selected via Memento datetime negotiation."""
    if request.headers.get('Accept-Datetime'):
        acc_dt = request.headers['Accept-Datetime']
        sn = getSnapshotfromTime(parse_rfc1123(acc_dt))
    else:
        sn = getCurrentSnapshot()

    session = current_app.config['dbsession']
    q = session.query(DatasetData) \
        .join(Dataset, DatasetData.md5 == Dataset.md5) \
        .filter(Dataset.snapshot == sn) \
        .filter(Dataset.portalid == portalid) \
        .filter(Dataset.id == datasetid)
    data = q.first()
    p = session.query(Portal).filter(Portal.id == portalid).first()

    doc = dcat_to_schemadotorg.convert(p, data.raw)
    timegate = '<' + HOST + '/' + portalid + '/' + datasetid + '/schemadotorg>'
    resp = add_memento_header(jsonify(doc), '<' + doc['@id'] + '>', timegate, sn)
    resp.headers['Vary'] = 'accept-datetime'
    d = tofirstdayinisoweek(sn)
    full_url = '<' + HOST + '/' + portalid + '/' + d.strftime("%y%m%d") + '/' + datasetid + '/schemadotorg>'
    resp.headers['Content-Location'] = full_url
    return resp
def generate_schemadotorg_files(obj):
    """Write schema.org JSON-LD files, a dataset list, and a sitemap for one portal snapshot."""
    P, dbConf, snapshot, dir = obj[0], obj[1], obj[2], obj[3]
    sitemap_urls = []
    dbm = DBManager(**dbConf)
    session = scoped_session(sessionmaker(bind=dbm.engine))
    log.info("Start schema.org files", portal=P.id)

    portal_dir = dir + '/' + P.id
    if not os.path.exists(portal_dir):
        os.mkdir(portal_dir)

    p = session.query(Portal).filter(Portal.id == P.id).first()
    q = session.query(Dataset) \
        .filter(Dataset.snapshot == snapshot) \
        .filter(Dataset.portalid == P.id)

    datasetlist = []
    i = 0
    for d in q.all():
        try:
            data_q = session.query(DatasetData) \
                .join(Dataset, DatasetData.md5 == Dataset.md5) \
                .filter(Dataset.snapshot == snapshot) \
                .filter(Dataset.portalid == P.id) \
                .filter(Dataset.id == d.id)
            data = data_q.first()
            doc = dcat_to_schemadotorg.convert(p, data.raw)

            dataset_filename = base64.urlsafe_b64encode(d.id)
            dataset_file = portal_dir + "/" + dataset_filename
            create_schemadotorg(doc, dataset_file)

            t = d.title if d.title else d.id
            datasetlist.append((t, dataset_filename))

            dt = data.modified
            # discard implausible modification dates
            if dt is not None and dt < datetime.datetime(year=1980, month=1, day=1):
                dt = None
            sitemap_urls.append((dataset_file, dt))

            i += 1
            if i % 50000 == 0:
                log.info("Processed datasets", datasets=str(i))
        except Exception as exc:
            ErrorHandler.handleError(log, "CreateSchema.orgFile", exception=exc, pid=P.id,
                                     snapshot=snapshot, exc_info=True)

    create_datasetlist(datasetlist, portal_dir)
    create_sitemap(sitemap_urls, portal_dir)
    dt = tofirstdayinisoweek(snapshot)
    return P, dt, snapshot
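# create_sitemap() is called above but not defined in this section. A rough sketch
# under the assumption that sitemap_urls holds (url, lastmod-or-None) pairs as built
# above and that a plain sitemaps.org XML file is wanted; the helper body and the
# file name "sitemap.xml" are assumptions, not the project's confirmed implementation.
import os

def create_sitemap(sitemap_urls, portal_dir):
    lines = ['<?xml version="1.0" encoding="UTF-8"?>',
             '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">']
    for url, lastmod in sitemap_urls:
        lines.append('  <url>')
        lines.append('    <loc>' + url + '</loc>')
        if lastmod is not None:
            # lastmod is the dataset's modification date when plausible, else omitted
            lines.append('    <lastmod>' + lastmod.strftime('%Y-%m-%d') + '</lastmod>')
        lines.append('  </url>')
    lines.append('</urlset>')
    with open(os.path.join(portal_dir, 'sitemap.xml'), 'w') as f:
        f.write('\n'.join(lines))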
def adequate_prov(graph, snapshot):
    ad_activity = URIRef("http://www.adequate.at/csvprofiler/" + str(snapshot))
    graph.add((ad_activity, RDF.type, PROV.Activity))
    graph.add((ad_activity, PROV.startedAtTime, Literal(tofirstdayinisoweek(snapshot))))
    graph.add((ad_activity, PROV.endedAtTime, Literal(toLastdayinisoweek(snapshot))))
    graph.add((ad_activity, PROV.wasAssociatedWith, AD_AGENT))
    return ad_activity
def csvw_prov(graph, snapshot):
    csvw_activity = URIRef("http://data.wu.ac.at/portalwatch/csvw/" + str(snapshot))
    graph.add((csvw_activity, RDF.type, PROV.Activity))
    graph.add((csvw_activity, PROV.startedAtTime, Literal(tofirstdayinisoweek(snapshot))))
    graph.add((csvw_activity, PROV.endedAtTime, Literal(toLastdayinisoweek(snapshot))))
    graph.add((csvw_activity, PROV.wasAssociatedWith, PW_AGENT))
    return csvw_activity
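# tofirstdayinisoweek()/toLastdayinisoweek() are used throughout this section but
# defined elsewhere. A sketch of the snapshot-to-date mapping, under the assumption
# that a snapshot id encodes a two-digit year and week number as YYWW (e.g. 1733 for
# week 33 of 2017); this encoding, and the helpers below, are assumptions rather
# than the project's actual code. Note that %W is not strictly ISO week numbering.
import datetime

def tofirstdayinisoweek(snapshot):
    year = 2000 + int(str(snapshot)[:2])
    week = int(str(snapshot)[2:])
    # Monday of the given week
    return datetime.datetime.strptime('{0}-W{1}-1'.format(year, week), '%Y-W%W-%w')

def toLastdayinisoweek(snapshot):
    # Sunday of the same week
    return tofirstdayinisoweek(snapshot) + datetime.timedelta(days=6)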
def get(self, portalid, datasetid):
    """Dataset metadata for the snapshot selected via Memento datetime negotiation."""
    if request.headers.get('Accept-Datetime'):
        acc_dt = request.headers['Accept-Datetime']
        sn = getSnapshotfromTime(parse_rfc1123(acc_dt))
    else:
        sn = getCurrentSnapshot()

    resp = get_dataset(portalid, sn, datasetid)
    resp.headers['Vary'] = 'accept-datetime'
    d = tofirstdayinisoweek(sn)
    full_url = HOST + '/' + portalid + '/' + d.strftime("%Y%m%d") + '/' + datasetid
    resp.headers['Content-Location'] = full_url
    return resp
def add_memento_header(resp, orig_ref, timegate, snapshot):
    # timestamp of snapshot
    dt = tofirstdayinisoweek(snapshot)
    stamp = mktime(dt.timetuple())
    formatted_dt = formatdate(timeval=stamp, localtime=False, usegmt=True)
    resp.headers['Memento-Datetime'] = formatted_dt
    # link to original resource
    resp.headers['Link'] = orig_ref + '; rel="original", ' + timegate + '; rel="timegate"'
    return resp
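# parse_rfc1123() is used by the GET handlers above to turn the Accept-Datetime
# header into a datetime, but it is not shown in this section. A minimal sketch
# based on the standard library; the helper's actual implementation may differ.
from email.utils import parsedate
import datetime

def parse_rfc1123(value):
    # e.g. "Thu, 31 May 2007 20:35:00 GMT" -> datetime(2007, 5, 31, 20, 35, 0)
    return datetime.datetime(*parsedate(value)[:6])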
def get_dataset(portalid, snapshot, datasetid):
    session = current_app.config['dbsession']
    q = session.query(DatasetData) \
        .join(Dataset, DatasetData.md5 == Dataset.md5) \
        .filter(Dataset.snapshot <= snapshot) \
        .filter(Dataset.portalid == portalid) \
        .filter(Dataset.id == datasetid) \
        .order_by(Dataset.snapshot.desc())
    data = q.first()
    if data:
        resp = jsonify(row2dict(data))
        portal = session.query(Portal).filter(Portal.id == portalid).first()
        g = rdflib.Graph()
        dataset_ref = add_dcat_to_graph(data.raw, portal, g, None)
        timegate = '<' + HOST + '/' + portalid + '/' + datasetid + '>'
        return add_memento_header(resp, dataset_ref.n3(), timegate, snapshot)
    else:
        resp = jsonify({'error': 'There is no version of dataset ' + datasetid +
                                 ' available that is older than ' + str(tofirstdayinisoweek(snapshot)),
                        'portalid': portalid})
        resp.status_code = 404
        return resp
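# Example client-side use of the endpoints above: fetching a dataset as it looked at
# a given point in time via Memento datetime negotiation. The host, portal id, and
# dataset id are placeholders; the path layout (<HOST>/<portalid>/<datasetid>) and
# the response headers follow the handlers and add_memento_header() above.
import requests

url = 'http://example.org' + '/' + 'data_gv_at' + '/' + 'some-dataset-id'
r = requests.get(url, headers={'Accept-Datetime': 'Thu, 31 May 2017 12:00:00 GMT'})
print(r.headers.get('Memento-Datetime'))    # snapshot timestamp (RFC 1123)
print(r.headers.get('Link'))                # rel="original" / rel="timegate" links
print(r.headers.get('Content-Location'))    # snapshot-specific URL for this version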