def _get_measures_for_dataset(portal, dataset, datasetquality):
    # write dcat dataset into graph
    graph = rdflib.Graph()
    dataset_converter.dict_to_dcat(dataset.data.raw, portal, graph=graph)

    # collect the DQV quality measures for this dataset in a separate graph
    measures_g = rdflib.Graph()
    ds_id = graph.value(predicate=RDF.type, object=DCAT.Dataset)
    dataset_quality_to_dqv(measures_g, ds_id, datasetquality, dataset.snapshot)
    return measures_g, ds_id
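
# Usage sketch (hypothetical wiring, assuming a Dataset ORM row with an
# attached `data` relation and a matching DatasetQuality row `dq`):
#
#   measures_g, ds_id = _get_measures_for_dataset(portal, dataset, dq)
#   print measures_g.serialize(format='json-ld')
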
def get(self, portalid, snapshot, datasetid):
    with Timer(key="PortalDatasetData.get", verbose=True):
        session = current_app.config['dbsession']
        q = session.query(DatasetData) \
            .join(Dataset, DatasetData.md5 == Dataset.md5) \
            .filter(Dataset.snapshot == snapshot) \
            .filter(Dataset.portalid == portalid) \
            .filter(Dataset.id == datasetid)
        data = q.first()
        P = session.query(Portal).filter(Portal.id == portalid).first()
        return jsonify(dict_to_dcat(data.raw, P))
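
# This resource returns the stored raw metadata of one dataset converted to
# DCAT as JSON. The route is not defined in this file; the path below is only
# an assumption based on the parameter names:
#
#   GET /portal/<portalid>/<snapshot>/dataset/<datasetid>
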
def contactPerOrga(Session, portal, snapshot, orga):
    q = Session.query(Dataset) \
        .filter(Dataset.portalid == portal.id) \
        .filter(Dataset.snapshot == snapshot) \
        .filter(Dataset.organisation == orga) \
        .join(DatasetData, DatasetData.md5 == Dataset.md5) \
        .join(DatasetQuality, DatasetQuality.md5 == Dataset.md5) \
        .add_entity(DatasetData).add_entity(DatasetQuality)

    pereMail = set()
    for res in q:
        # res is a (Dataset, DatasetData, DatasetQuality) row; row2dict
        # flattens it into one dict (holding e.g. 'id' and 'raw')
        ds = row2dict(res)
        d = portal_fetch_processors.Dataset(snapshot=snapshot,
                                            portalID=portal.id,
                                            did=ds['id'],
                                            data=ds['raw'],
                                            status=200,
                                            software=portal.software)
        d.dcat = dict_to_dcat(ds['raw'], portal)
        contact = getContactPointValues(d)
        if len(contact) > 1:
            pereMail.add(contact[1])
    return pereMail
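
# Usage sketch: collect the distinct contact e-mails for one organisation.
# The session/portal objects are assumed to come from the surrounding setup;
# the organisation name is a hypothetical example:
#
#   mails = contactPerOrga(session, portal, snapshot, 'city-of-vienna')
#   for mail in mails:
#       print mail
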
def _get_quality(self, args, data, filename):
    try:
        content_type = 'application/json'
        default_url = 'http://missing.portal.url.com'
        portal_url = args.get('portal_url', default_url)
        if not portal_url:
            portal_url = default_url
        default_out = 'json'
        out_format = args.get('format', default_out)
        if not out_format:
            out_format = default_out
        filter_metrics = args.get('metric')

        if 'software' in args:
            software = args['software']

            # stub portal class
            class Portal:
                def __init__(self, software, uri):
                    self.software = software
                    self.apiuri = uri
            p = Portal(software, portal_url)

            # get rdf graph and add measures and dimensions
            graph = rdflib.Graph()
            # write dcat dataset into graph
            dcat = dataset_converter.dict_to_dcat(data, p, graph=graph)
            measures_g = rdflib.Graph()
            ds_id = graph.value(predicate=RDF.type, object=DCAT.Dataset)

            datasetquality = DatasetQuality(data, dcat)
            metrics_dict = datasetquality.__dict__
            if filter_metrics:
                metrics_dict = {m: metrics_dict[m] for m in filter_metrics}

            if out_format == 'json':
                resp = jsonify(metrics_dict)
            elif out_format == 'json-ld':
                dataset_quality_to_dqv(measures_g, ds_id, datasetquality,
                                       utils_snapshot.getCurrentSnapshot())
                dqv_export.add_dimensions_and_metrics(measures_g)
                resp = jsonify(json.loads(measures_g.serialize(format="json-ld")))
            elif out_format == 'csv':
                outstr = StringIO.StringIO()
                w = csv.DictWriter(outstr, metrics_dict.keys())
                w.writeheader()
                w.writerow(metrics_dict)
                resp = outstr.getvalue()
                content_type = 'text/csv'
            else:
                raise Exception('output format not supported: ' + out_format)

            filename = secure_filename(filename).split('/')[-1]
            return makeResponse(resp, filename, content_type=content_type)
        else:
            e = 'Portal software parameter required for conversion. ' \
                '"software" should be "CKAN", "Socrata", or "OpenDataSoft".'
    except Exception as ex:
        e = ex.message
    # reached both when 'software' is missing and when any exception occurred
    resp = jsonify({'error': 'Could not parse JSON', 'message': e})
    resp.status_code = 406
    return resp
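
# Example request parameters this handler understands (a sketch; the
# parameter names are taken from the code above, the values are hypothetical):
#
#   args = {'software': 'CKAN',
#           'portal_url': 'http://data.example.org',
#           'format': 'csv',               # 'json' (default), 'json-ld' or 'csv'
#           'metric': ['some_metric']}     # optional filter, hypothetical name
#   resp = self._get_quality(args, dataset_dict, 'dataset.json')
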
def convert(portal, data):
    g = rdflib.Graph()
    # write dcat dataset into graph
    dataset_converter.dict_to_dcat(data, portal, graph=g)
    ds_id = g.value(predicate=RDF.type, object=DCAT.Dataset)

    doc = {
        "@context": "http://schema.org",
        "@type": "Dataset",
        "@id": str(ds_id),
        "catalog": {
            "@type": "DataCatalog",
            "@id": portal.uri,
            "url": portal.uri,
            "spatialCoverage": portal.iso,
            "description": "Underlying software: " + portal.software
        }
    }

    # organization; initialise pub first so a dataset with only a contact
    # point (and no publisher) does not raise a NameError
    pub = None
    if (ds_id, DCTERMS.publisher, None) in g:
        pub = {"@type": "Organization"}
        orga = g.value(ds_id, DCTERMS.publisher)
        resp_party(g, pub, orga)
    # contact point
    if (ds_id, DCAT.contactPoint, None) in g:
        if pub is None:
            pub = {"@type": "Organization"}
        orga = g.value(ds_id, DCAT.contactPoint)
        resp_party(g, pub, orga)
    if pub is not None:
        doc['publisher'] = pub

    # general fields
    if (ds_id, DCTERMS.title, None) in g:
        doc["name"] = str(g.value(ds_id, DCTERMS.title))
    if (ds_id, DCTERMS.description, None) in g:
        doc["description"] = str(g.value(ds_id, DCTERMS.description))
    if (ds_id, DCAT.landingPage, None) in g:
        # the value must be read via DCAT.landingPage as well
        doc["url"] = str(g.value(ds_id, DCAT.landingPage))
    if (ds_id, DCTERMS.spatial, None) in g:
        doc["spatialCoverage"] = str(g.value(ds_id, DCTERMS.spatial))
    if (ds_id, DCTERMS.temporal, None) in g:
        doc["datasetTimeInterval"] = str(g.value(ds_id, DCTERMS.temporal))
    if (ds_id, DCAT.theme, None) in g:
        doc["about"] = str(g.value(ds_id, DCAT.theme))
    if (ds_id, DCTERMS.modified, None) in g:
        doc["dateModified"] = str(g.value(ds_id, DCTERMS.modified))
    if (ds_id, DCTERMS.issued, None) in g:
        doc["datePublished"] = str(g.value(ds_id, DCTERMS.issued))
    if (ds_id, DCTERMS.language, None) in g:
        doc["inLanguage"] = str(g.value(ds_id, DCTERMS.language))
    if (ds_id, DCAT.keyword, None) in g:
        doc["keywords"] = []
        for keyword in g.objects(ds_id, DCAT.keyword):
            doc["keywords"].append(str(keyword))

    doc["distribution"] = []
    for dist_id in g.objects(ds_id, DCAT.distribution):
        dist = {"@type": "DataDownload", "@id": str(dist_id)}
        if (dist_id, DCTERMS.title, None) in g:
            dist["name"] = str(g.value(dist_id, DCTERMS.title))
        if (dist_id, DCTERMS.description, None) in g:
            dist["description"] = str(g.value(dist_id, DCTERMS.description))
        if (dist_id, DCTERMS.modified, None) in g:
            dist["dateModified"] = str(g.value(dist_id, DCTERMS.modified))
        if (dist_id, DCTERMS.issued, None) in g:
            dist["datePublished"] = str(g.value(dist_id, DCTERMS.issued))
        if (dist_id, DCTERMS['format'], None) in g:
            dist["encodingFormat"] = str(g.value(dist_id, DCTERMS['format']))
        if (dist_id, DCAT.byteSize, None) in g:
            dist["contentSize"] = str(g.value(dist_id, DCAT.byteSize))
        if (dist_id, DCAT.mediaType, None) in g:
            dist["fileFormat"] = str(g.value(dist_id, DCAT.mediaType))
        if (dist_id, DCAT.accessURL, None) in g:
            dist["contentUrl"] = str(g.value(dist_id, DCAT.accessURL))
        elif (dist_id, DCAT.downloadURL, None) in g:
            dist["contentUrl"] = str(g.value(dist_id, DCAT.downloadURL))
        if (dist_id, DCTERMS.license, None) in g:
            lic = g.value(dist_id, DCTERMS.license)
            if isinstance(lic, BNode):
                # look for description
                if (lic, RDFS.label, None) in g:
                    dist["license"] = str(g.value(lic, RDFS.label))
                elif (lic, DCTERMS.identifier, None) in g:
                    dist["license"] = str(g.value(lic, DCTERMS.identifier))
            else:
                dist["license"] = str(lic)
        doc["distribution"].append(dist)
    return doc
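
# Shape of the schema.org document this produces for a dataset with one
# distribution (values are illustrative, not taken from a real portal):
#
#   {
#     "@context": "http://schema.org",
#     "@type": "Dataset",
#     "name": "...",
#     "publisher": {"@type": "Organization", ...},
#     "distribution": [{"@type": "DataDownload", "contentUrl": "...", ...}]
#   }
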
def insertDatasets(P, db, iter, snapshot, batch=100, store_local=None):
    log.info("insertDatasets", portalid=P.id, snapshot=snapshot)
    bulk_obj = {'mr': [], 'd': [], 'dq': []}
    c = 0
    for i, d in enumerate(iter):
        c += 1
        with Timer(key='ProcessDataset'):
            # CREATE DATASET AND ADD
            with Timer(key='md5'):
                md5v = None if d.data is None else md5(d.data)
            if md5v:
                with Timer(key='dict_to_dcat'):
                    # analyse quality
                    d.dcat = dict_to_dcat(d.data, P)
                DD = None
                DQ = None
                with Timer(key='db.datasetdataExists(md5v)'):
                    process = not db.exist_datasetdata(md5v)
                if process:
                    # DATASET DATA
                    DD = createDatasetData(md5v, d)
                    try:
                        db.add(DD)  # primary key, needs to be inserted first
                        # DATASET QUALITY
                        DQ = createDatasetQuality(P, md5v, d)
                        bulk_obj['dq'].append(DQ)
                        # META RESOURCES
                        MQs = createMetaResources(md5v, d)
                        for MR in MQs:
                            bulk_obj['mr'].append(MR)
                    except Exception as e:
                        # the same md5 may have been inserted concurrently;
                        # ignore the duplicate insert
                        pass
                # DATASET
                title = getTitle(d)
                title = title[0] if len(title) > 0 else None
                D = Dataset(id=d.id,
                            snapshot=d.snapshot,
                            portalid=d.portal_id,
                            md5=md5v,
                            organisation=DD.organisation if DD else getOrganization(d),
                            title=title)
                bulk_obj['d'].append(D)

                # store metadata in local git directory
                try:
                    if store_local is not None:
                        with Timer(key='store_to_local_git'):
                            if 'name' in d.data:
                                dir_name = d.data['name']
                            else:
                                dir_name = d.id
                            filename = os.path.join(store_local, P.id, dir_name)
                            if not os.path.exists(filename):
                                os.makedirs(filename)
                            with open(os.path.join(filename, 'original.json'), 'w') as f:
                                json.dump(d.data, f, indent=4)

                            g = rdflib.Graph()
                            g.parse(data=json.dumps(d.dcat), format='json-ld')
                            dqv_export.general_prov(g)
                            ds_id = g.value(predicate=RDF.type, object=DCAT.Dataset)
                            if not DQ:
                                DQ = db.datasetqualityExists(md5=md5v)
                            if DQ:
                                dqv_export.add_dimensions_and_metrics(g)
                                dataset_quality_to_dqv(g, ds_id, DQ, snapshot)
                            with open(os.path.join(filename, 'metadata.jsonld'), 'w') as f:
                                g.serialize(f, format='json-ld')
                except Exception as exc:
                    ErrorHandler.handleError(log, "StoreToLocalGitException",
                                             exception=exc, pid=P.id,
                                             dataset=d.id, snapshot=snapshot,
                                             exc_info=True)
            else:
                # no metadata available, store the dataset entry only
                D = Dataset(id=d.id,
                            snapshot=d.snapshot,
                            portalid=d.portal_id,
                            md5=md5v,
                            organisation=None)
                bulk_obj['d'].append(D)
        if i % batch == 0:
            bulkInsert(bulk_obj, db)
            for k in bulk_obj:
                bulk_obj[k] = []
    # cleanup, commit all remaining inserts; c already holds the number of
    # processed datasets
    bulkInsert(bulk_obj, db)
    for k in bulk_obj:
        bulk_obj[k] = []
    log.info("InsertedDatasets", parsed=c, portalid=P.id, snapshot=snapshot)
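
# Usage sketch (assumed caller, e.g. a fetch job; `processor.iterate()` is a
# hypothetical generator yielding fetched Dataset objects for one portal):
#
#   insertDatasets(P, db, processor.iterate(), snapshot,
#                  batch=100, store_local='/data/git')
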
def orgaReport(Session, portal, snapshot, orga, contact=None):
    with Timer(key=orga, verbose=True):
        q = Session.query(Dataset) \
            .filter(Dataset.portalid == portal.id) \
            .filter(Dataset.snapshot == snapshot) \
            .filter(Dataset.organisation == orga) \
            .join(DatasetData, DatasetData.md5 == Dataset.md5) \
            .join(DatasetQuality, DatasetQuality.md5 == Dataset.md5) \
            .add_entity(DatasetData).add_entity(DatasetQuality)

        pereMail = {}
        uris = set()
        summary = {'status': defaultdict(int)}
        summary['status'][200] = 0
        summary['status'][404] = 0
        summary['status']['total'] = 0

        for res in q:  # res is a (Dataset, DatasetData, DatasetQuality) tuple
            ds = {}
            ds['dataset'] = row2dict(res[0])
            #ds['dataset']['external_uri']=portal.apiuri+"/katalog/dataset"
            ds['data'] = row2dict(res[1])
            ds['quality'] = row2dict(res[2])

            d = portal_fetch_processors.Dataset(snapshot=snapshot,
                                                portalID=portal.id,
                                                did=ds['dataset']['id'],
                                                data=ds['data']['raw'],
                                                status=200,
                                                software=portal.software)
            d.dcat = dict_to_dcat(ds['data']['raw'], portal)
            contactInfo = getContactPointValues(d)
            if len(contactInfo) > 1:
                # optionally restrict the report to a single contact
                if contact is not None and contact != contactInfo[1]:
                    continue
                ds['report'] = dataset_reporter.report(res[1], res[2], portal.software)
                orgas = pereMail.setdefault(contactInfo[1], {})
                ds_list = orgas.setdefault(orga, [])
                ds_list.append(ds)

            ds['resourcesStatus'] = defaultdict(int)
            ds['resourcesStatus']['total'] = 0
            ds['resources'] = [row2dict(r) for r in res[1].resources]
            for resou in ds['resources']:
                resri = Session.query(ResourceInfo) \
                    .filter(ResourceInfo.uri == resou['uri']) \
                    .filter(ResourceInfo.snapshot == snapshot).first()
                if resri is not None:
                    resou['info'] = row2dict(resri)
                    ds['resourcesStatus'][resou['info']['status']] += 1
                    ds['resourcesStatus']['total'] += 1
                    # count each distinct resource URI only once in the summary
                    if resou['uri'] not in uris:
                        summary['status'][resou['info']['status']] += 1
                        summary['status']['total'] += 1
                        uris.add(resou['uri'])
            ds['resourcesStatus'] = dict(ds['resourcesStatus'])

        ContactCount = 0
        #print " Organisation:", orga
        for k, v in pereMail.items():
            print " contact:", k
            for orga, ds_list in v.items():
                print " ", orga, len(ds_list)
                ContactCount += len(ds_list)
                for ds in ds_list:
                    print " >", ds['report']

        summary['status'] = dict(summary['status'])
        pereMail['summary'] = summary
        pereMail['summary']['totaluris'] = len(uris)
        return pereMail
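
# Usage sketch: build the per-contact report for one organisation and dump
# the resource-status summary (the organisation name is a hypothetical
# example):
#
#   report = orgaReport(session, portal, snapshot, 'city-of-vienna')
#   print json.dumps(report['summary'], indent=2)
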