Example No. 1
    def get(self, portalid, date, datasetid):
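        # resolve the requested date to the matching snapshot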
        d = parseDate(date)
        sn = getSnapshotfromTime(d)

        session = current_app.config['dbsession']
        p = session.query(Portal).filter(Portal.id == portalid).first()

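        # most recent dataset version at or before the requested snapshot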
        q = session.query(Dataset) \
            .filter(Dataset.snapshot <= sn) \
            .filter(Dataset.portalid == portalid) \
            .filter(Dataset.id == datasetid) \
            .order_by(Dataset.snapshot.desc())
        dataset = q.first()

        if dataset:
            snapshot = dataset.snapshot

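            # quality metrics for that version, joined via the metadata content hash (md5)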
            q = session.query(DatasetQuality) \
                .join(Dataset, DatasetQuality.md5 == Dataset.md5) \
                .filter(Dataset.snapshot == snapshot) \
                .filter(Dataset.portalid == portalid) \
                .filter(Dataset.id == datasetid)
            dataset_qual = q.first()

            # get rdf graph and add measures and dimensions
            g, ds_id = dqv_export._get_measures_for_dataset(p, dataset, dataset_qual)
            dqv_export.add_dimensions_and_metrics(g)
            resp = jsonify(json.loads(g.serialize(format="json-ld")))
            timegate = '<' + HOST + '/' + portalid + '/' + datasetid + '/dqv>'
            return add_memento_header(resp, ds_id.n3(), timegate, snapshot)
        else:
            return jsonify({'error': 'There is no version of dataset ' + datasetid + ' available that is older than ' + str(d),
                        'portalid': portalid})
Example No. 2
    def get(self, portalid, datasetid):
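        # Memento-style datetime negotiation: use Accept-Datetime if sent, otherwise the current snapshot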
        if request.headers.get('Accept-Datetime'):
            acc_dt = request.headers['Accept-Datetime']
            sn = getSnapshotfromTime(parse_rfc1123(acc_dt))
        else:
            sn = getCurrentSnapshot()

        session = current_app.config['dbsession']
        p = session.query(Portal).filter(Portal.id == portalid).first()
        q = session.query(DatasetQuality) \
            .join(Dataset, DatasetQuality.md5 == Dataset.md5) \
            .filter(Dataset.snapshot == sn) \
            .filter(Dataset.portalid == portalid) \
            .filter(Dataset.id == datasetid)
        dataset_qual = q.first()

        q = session.query(Dataset) \
            .filter(Dataset.snapshot == sn) \
            .filter(Dataset.portalid == portalid) \
            .filter(Dataset.id == datasetid)
        dataset = q.first()
        # get rdf graph and add measures and dimensions
        g, ds_id = dqv_export._get_measures_for_dataset(p, dataset, dataset_qual)
        dqv_export.add_dimensions_and_metrics(g)
        resp = jsonify(json.loads(g.serialize(format="json-ld")))
        timegate = '<' + HOST + '/' + portalid + '/' + datasetid + '/dqv>'
        resp = add_memento_header(resp, ds_id.n3(), timegate, sn)

        resp.headers['Vary'] = 'accept-datetime'
        d = tofirstdayinisoweek(sn)
        full_url = '<' + HOST + '/' + portalid + '/' + d.strftime("%y%m%d") + '/' + datasetid + '/dqv>'
        resp.headers['Content-Location'] = full_url
        return resp
Example No. 3
    def get(self, portalid, snapshot, datasetid):
        with Timer(key="PortalDatasetDataQuality.get", verbose=True):
            session = current_app.config['dbsession']

            p = session.query(Portal).filter(Portal.id == portalid).first()

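            # quality metrics for the dataset at the given snapshot, joined via the metadata content hash (md5)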
            q = session.query(DatasetQuality) \
                .join(Dataset, DatasetQuality.md5 == Dataset.md5) \
                .filter(Dataset.snapshot == snapshot) \
                .filter(Dataset.portalid == portalid) \
                .filter(Dataset.id == datasetid)
            dataset_qual = q.first()

            q = session.query(Dataset) \
                .filter(Dataset.snapshot == snapshot) \
                .filter(Dataset.portalid == portalid) \
                .filter(Dataset.id == datasetid)
            dataset = q.first()
            # get rdf graph and add measures and dimensions
            g = dqv_export.get_measures_for_dataset(p, dataset, dataset_qual)
            dqv_export.add_dimensions_and_metrics(g)
            return jsonify(json.loads(g.serialize(format="json-ld")))
Example No. 4
    def _get_quality(self, args, data, filename):
        try:
            content_type = 'application/json'
            default_url = 'http://missing.portal.url.com'
            portal_url = args.get('portal_url', default_url)
            if not portal_url:
                portal_url = default_url

            default_out = 'json'
            out_format = args.get('format', default_out)
            if not out_format:
                out_format = default_out

            filter_metrics = args.get('metric')

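            # a portal software type ("CKAN", "Socrata", or "OpenDataSoft") is required to map the raw metadata to DCAT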
            if 'software' in args:
                software = args['software']

                # stub portal class
                class Portal:
                    def __init__(self, software, uri):
                        self.software = software
                        self.apiuri = uri

                p = Portal(software, portal_url)

                # get rdf graph and add measures and dimensions
                graph = rdflib.Graph()
                # write dcat dataset into graph
                dcat = dataset_converter.dict_to_dcat(data, p, graph=graph)
                measures_g = rdflib.Graph()
                ds_id = graph.value(predicate=RDF.type, object=DCAT.Dataset)
                datasetquality = DatasetQuality(data, dcat)
                metrics_dict = datasetquality.__dict__

                if filter_metrics:
                    metrics_dict = {m: metrics_dict[m] for m in filter_metrics}

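                # serialize the computed quality metrics in the requested output format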
                if out_format == 'json':
                    resp = jsonify(metrics_dict)
                elif out_format == 'json-ld':
                    dataset_quality_to_dqv(measures_g, ds_id, datasetquality,
                                           utils_snapshot.getCurrentSnapshot())
                    dqv_export.add_dimensions_and_metrics(measures_g)
                    resp = jsonify(
                        json.loads(measures_g.serialize(format="json-ld")))
                elif out_format == 'csv':
                    outstr = StringIO.StringIO()
                    w = csv.DictWriter(outstr, metrics_dict.keys())
                    w.writeheader()
                    w.writerow(metrics_dict)
                    resp = outstr.getvalue()
                    content_type = 'text/csv'
                else:
                    raise Exception('output format not supported: ' +
                                    out_format)

                filename = secure_filename(filename).split('/')[-1]
                return makeResponse(resp, filename, content_type=content_type)
            else:
                e = 'Portal software parameter required for conversion. ' \
                    '"software" should be "CKAN", "Socrata", or "OpenDataSoft".'
        except Exception as ex:
            e = str(ex)

        resp = jsonify({'error': 'Could not parse JSON', 'message': e})
        resp.status_code = 406
        return resp
Example No. 5
def insertDatasets(P, db, iter, snapshot, batch=100, store_local=None):

    log.info("insertDatasets", portalid=P.id, snapshot=snapshot)

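    # buffers for batched inserts: meta resources (mr), datasets (d) and dataset quality (dq)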
    bulk_obj = {'mr': [], 'd': [], 'dq': []}

    c = 0
    for i, d in enumerate(iter):
        c += 1
        with Timer(key='ProcessDataset'):
            #CREATE DATASET AND ADD

            with Timer(key='md5'):
                md5v = None if d.data is None else md5(d.data)

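            # datasets without fetched content are recorded below with md5=None and no quality metrics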
            if md5v:
                with Timer(key='dict_to_dcat'):
                    # analyse quality
                    d.dcat = dict_to_dcat(d.data, P)

                DD = None
                DQ = None
                with Timer(key='db.datasetdataExists(md5v)'):
                    process = not db.exist_datasetdata(md5v)
                if process:
                    # DATASET DATA
                    DD = createDatasetData(md5v, d)
                    try:
                        db.add(DD)  #primary key, needs to be inserted first
                        # DATASET QUALITY
                        #print "adding",md5v
                        DQ = createDatasetQuality(P, md5v, d)
                        bulk_obj['dq'].append(DQ)

                        #META RESOURCES
                        MQs = createMetaResources(md5v, d)
                        for MR in MQs:
                            bulk_obj['mr'].append(MR)
                    except Exception as e:
                        pass
                        #print "AND AGAIN",md5v, db.datasetdataExists(md5v)
                # DATASET
                title = getTitle(d)
                title = title[0] if len(title) > 0 else None

                D = Dataset(
                    id=d.id,
                    snapshot=d.snapshot,
                    portalid=d.portal_id,
                    md5=md5v,
                    organisation=DD.organisation if DD else getOrganization(d),
                    title=title)

                bulk_obj['d'].append(D)

                # store metadata in local git directory
                try:
                    if store_local is not None:
                        with Timer(key='store_to_local_git'):
                            if 'name' in d.data:
                                dir_name = d.data['name']
                            else:
                                dir_name = d.id
                            filename = os.path.join(store_local, P.id,
                                                    dir_name)
                            if not os.path.exists(filename):
                                os.makedirs(filename)

                            with open(os.path.join(filename, 'original.json'),
                                      'w') as f:
                                json.dump(d.data, f, indent=4)

                            g = rdflib.Graph()
                            g.parse(data=json.dumps(d.dcat), format='json-ld')
                            dqv_export.general_prov(g)
                            ds_id = g.value(predicate=RDF.type,
                                            object=DCAT.Dataset)
                            if not DQ:
                                DQ = db.datasetqualityExists(md5=md5v)
                            if DQ:
                                dqv_export.add_dimensions_and_metrics(g)
                                dataset_quality_to_dqv(g, ds_id, DQ, snapshot)
                            with open(
                                    os.path.join(filename, 'metadata.jsonld'),
                                    'w') as f:
                                g.serialize(f, format='json-ld')

                except Exception as exc:
                    ErrorHandler.handleError(log,
                                             "StoreToLocalGitException",
                                             exception=exc,
                                             pid=P.id,
                                             dataset=d.id,
                                             snapshot=snapshot,
                                             exc_info=True)
            else:
                D = Dataset(id=d.id,
                            snapshot=d.snapshot,
                            portalid=d.portal_id,
                            md5=md5v,
                            organisation=None)
                bulk_obj['d'].append(D)

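        # flush the accumulated objects to the database after every batch of datasets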
        if i % batch == 0:
            bulkInsert(bulk_obj, db)
            for k in bulk_obj:
                bulk_obj[k] = []

    # cleanup: commit any remaining buffered inserts
    bulkInsert(bulk_obj, db)
    for k in bulk_obj:
        bulk_obj[k] = []
    log.info("InsertedDatasets", parsed=c, portalid=P.id, snapshot=snapshot)