Example #1
def cli(args, dbm):
    sn = getCurrentSnapshot()

    dbConf = readDBConfFromFile(args.config)
    db = DBClient(dbm)
    rdf_dir = args.dir

    if not os.path.exists(rdf_dir):
        os.mkdir(rdf_dir)
    sn_dir = os.path.join(rdf_dir, str(sn))
    if not os.path.exists(sn_dir):
        os.mkdir(sn_dir)

    tasks = []
    if args.portalid:
        # .first() returns None when the portal is missing; .one() would raise instead
        P = db.Session.query(Portal).filter(Portal.id == args.portalid).first()
        if P is None:
            log.warn("PORTAL NOT IN DB", portalid=args.portalid)
            return
        else:
            tasks.append((P, dbConf, sn, sn_dir))
    else:
        for P in db.Session.query(Portal):
            tasks.append((P, dbConf, sn, sn_dir))

    log.info("START FETCH",
             processors=args.processors,
             dbConf=dbConf,
             portals=len(tasks))

    pool = Pool(args.processors)
    for x in pool.imap(streamCSVs, tasks):
        pid, sn = x[0].id, x[1]
        log.info("RECEIVED RESULT", portalid=pid, snapshot=sn)
Example #2
def portalLinkCheckCSV(portalid):
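    # build a CSV with one (organisation, link-check URL) row per organisation on the portal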
    with Timer(key="get_portalLinkCheckCSV", verbose=True):
        si = StringIO.StringIO()
        cw = csv.writer(si)
        snapshot = getCurrentSnapshot()

        Session = current_app.config['dbsession']
        data = getPortalInfos(Session, portalid, snapshot)
        with Timer(key="query_portalorgas", verbose=True):
            q = Session.query(Dataset.organisation) \
                .filter(Dataset.portalid == portalid) \
                .filter(Dataset.snapshot == snapshot).distinct(Dataset.organisation)

            data['organisations'] = [row2dict(res) for res in q]

        for o in data['organisations']:
            orga = o['organisation']
            #    with Timer(key="query_orga-emails", verbose=True):
            #        portal=Session.query(Portal).filter(Portal.id==portalid).first()
            #        # print('portal: ', portal, 'snapshot: ', snapshot, 'orga: ', orga)
            #        data['contacts'] = contactPerOrga(Session, portal, snapshot, orga)
            #        for cont in data['contacts']:
            linkcheck = 'https://tools.adequate.at' + url_for(
                '.orga_resources',
                portalid=portalid,
                snapshot=snapshot,
                orga=orga)
            cw.writerow([orga, linkcheck])

        output = make_response(si.getvalue())
        output.headers[
            "Content-Disposition"] = "attachment; filename=export.csv"
        output.headers["Content-type"] = "text/csv"
        return output
Example #3
def cli(args, dbm):
    if args.snapshot:
        sn = args.snapshot
    else:
        sn = getCurrentSnapshot()

    dbConf = readDBConfFromFile(args.config)
    db = DBClient(dbm)

    tasks = []
    if args.portalid:
        P = db.Session.query(Portal).filter(Portal.id == args.portalid).first()
        if P is None:
            log.warn("PORTAL NOT IN DB", portalid=args.portalid)
            return
        else:
            tasks.append((P, dbConf, sn))
    else:
        for P in db.Session.query(Portal):
            tasks.append((P, dbConf, sn))

    log.info("START FRESHNESS", processors=args.processors, dbConf=dbConf, portals=len(tasks))

    pool = Pool(args.processors)
    for x in pool.imap(change_history, tasks):
        pid, sn = x[0].id, x[1]
        log.info("RECEIVED RESULT", portalid=pid, snapshot=sn)
Example #4
def cli(args, dbm):
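    # read the git target from the YAML config, then run git_update for one portal or for all portals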
    db = DBClient(dbm)

    git = None
    if args.config:
        with open(args.config) as f_conf:
            config = yaml.load(f_conf)
            if 'git' in config:
                git = config['git']

    if not git:
        log.warn("GIT LOCATION OR URL NOT SPECIFIED")
        return

    sn = getCurrentSnapshot()

    if args.portalid:
        P = db.Session.query(Portal).filter(Portal.id == args.portalid).first()
        if P is None:
            log.warn("PORTAL NOT IN DB", portalid=args.portalid)
            return
        else:
            git_update(P, sn, git)
    else:
        for P in db.Session.query(Portal):
            git_update(P, sn, git)
Example #5
    def get(self, portalid, datasetid):
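        # Memento-style datetime negotiation: an Accept-Datetime header selects a past
        # snapshot, otherwise the current snapshot is served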
        if request.headers.get('Accept-Datetime'):
            acc_dt = request.headers['Accept-Datetime']
            sn = getSnapshotfromTime(parse_rfc1123(acc_dt))
        else:
            sn = getCurrentSnapshot()

        session = current_app.config['dbsession']

        q = session.query(DatasetData) \
            .join(Dataset, DatasetData.md5 == Dataset.md5) \
            .filter(Dataset.snapshot == sn) \
            .filter(Dataset.portalid == portalid) \
            .filter(Dataset.id == datasetid)
        data = q.first()
        p = session.query(Portal).filter(Portal.id == portalid).first()
        doc = dcat_to_schemadotorg.convert(p, data.raw)
        timegate = '<' + HOST + '/' + portalid + '/' + datasetid + '/schemadotorg>'
        resp = add_memento_header(jsonify(doc), '<' + doc['@id'] + '>', timegate, sn)

        resp.headers['Vary'] = 'accept-datetime'
        d = tofirstdayinisoweek(sn)
        full_url = '<' + HOST + '/' + portalid + '/' + d.strftime("%y%m%d") + '/' + datasetid + '/schemadotorg>'
        resp.headers['Content-Location'] = full_url
        return resp
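parse_rfc1123 is not shown in this listing; Accept-Datetime values are RFC 1123 timestamps, so a minimal sketch of such a helper (name and body are assumptions) could look like:

from datetime import datetime

def parse_rfc1123_sketch(value):
    # e.g. "Sun, 06 Nov 1994 08:49:37 GMT"
    return datetime.strptime(value, '%a, %d %b %Y %H:%M:%S GMT')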
Example #6
    def get(self, portalid, datasetid):
        if request.headers.get('Accept-Datetime'):
            acc_dt = request.headers['Accept-Datetime']
            sn = getSnapshotfromTime(parse_rfc1123(acc_dt))
        else:
            sn = getCurrentSnapshot()


        session = current_app.config['dbsession']
        p = session.query(Portal).filter(Portal.id == portalid).first()
        q = session.query(DatasetQuality) \
            .join(Dataset, DatasetQuality.md5 == Dataset.md5) \
            .filter(Dataset.snapshot == sn) \
            .filter(Dataset.portalid == portalid) \
            .filter(Dataset.id == datasetid)
        dataset_qual = q.first()

        q = session.query(Dataset) \
            .filter(Dataset.snapshot == sn) \
            .filter(Dataset.portalid == portalid) \
            .filter(Dataset.id == datasetid)
        dataset = q.first()
        # get rdf graph and add measures and dimensions
        g, ds_id = dqv_export._get_measures_for_dataset(p, dataset, dataset_qual)
        dqv_export.add_dimensions_and_metrics(g)
        resp = jsonify(json.loads(g.serialize(format="json-ld")))
        timegate = '<' + HOST + '/' + portalid + '/' + datasetid + '/dqv>'
        resp = add_memento_header(resp, ds_id.n3(), timegate, sn)

        resp.headers['Vary'] = 'accept-datetime'
        d = tofirstdayinisoweek(sn)
        full_url = '<' + HOST + '/' + portalid + '/' + d.strftime("%y%m%d") + '/' + datasetid + '/dqv>'
        resp.headers['Content-Location'] = full_url
        return resp
Example #7
def cli(args, dbm):
    dbConf = readDBConfFromFile(args.config)
    db = DBClient(dbm)
    if not args.sn:
        sn = getCurrentSnapshot()
    else:
        sn = args.sn

    directory = args.directory

    tasks = []
    if args.portalid:
        P = db.Session.query(Portal).filter(Portal.id == args.portalid).first()
        if P is None:
            log.warn("PORTAL NOT IN DB", portalid=args.portalid)
            return
        else:
            tasks.append((P, dbConf, sn, directory))
    else:
        for P in db.Session.query(Portal):
            tasks.append((P, dbConf, sn, directory))

    log.info("START FETCH",
             processors=args.processors,
             dbConf=dbConf,
             portals=len(tasks))

    portals = []
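    # collect (portalid, last modification) pairs for the sitemap index written below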
    pool = Pool(args.processors)
    for x in pool.imap(generate_schemadotorg_files, tasks):
        pid, lastmod, sn = x[0].id, x[1], x[2]
        portals.append((pid, lastmod))
        log.info("RECEIVED RESULT", portalid=pid)

    create_portal_sitemapindex(portals, directory)
Example #8
def cli(args, dbm):
    sn = getCurrentSnapshot()

    dbConf = readDBConfFromFile(args.config)
    db = DBClient(dbm)

    aggregateFormatDist(db, sn)
Example #9
def portalRes(portalid, snapshot=None):
    if not snapshot:
        snapshot = getCurrentSnapshot()
    Session = current_app.config['dbsession']
    data = {}
    data.update(getPortalInfos(Session, portalid, snapshot))
    return render("odpw_portal_resources.jinja",
                  data=data,
                  snapshot=snapshot,
                  portalid=portalid)
Example #10
def cli(args, dbm):
    sn = getCurrentSnapshot()
    db = DBClient(dbm)

    settings = get_project_settings()
    crawler = CrawlerProcess(settings)

    seen = set()
    crawler.crawl(HeadLookups, snapshot=sn, db=db, batch=args.batch, seen=seen, iso=args.iso, exclude_iso=args.exclude_iso)

    crawler.start()
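The cli functions in this listing receive an argparse-style namespace; a purely illustrative parser for this command, with flag names guessed from the attributes accessed above, might look like:

import argparse

parser = argparse.ArgumentParser(description='schedule HEAD lookups for resource URLs')
parser.add_argument('--batch', type=int, default=1000)
parser.add_argument('--iso', help='restrict the crawl to portals with this ISO country code (assumed semantics)')
parser.add_argument('--exclude-iso', dest='exclude_iso', help='skip portals with this ISO country code (assumed semantics)')
args = parser.parse_args()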
Example #11
def cli(args, dbm):

    datadir = None
    git_location = None
    git_url = None
    if args.config:
        with open(args.config) as f_conf:
            config = yaml.load(f_conf)
            if 'data' in config:
                datadir = config['data']['datadir']
            if 'git' in config and 'datadir' in config['git']:
                git_location = config['git']['datadir']
            if 'git' in config and 'external' in config['git']:
                git_url = config['git']['external']

    if datadir is None:
        log.error("No data dir specified in config", config=args.config)
        return
    log.info("Init datadir", datadir=datadir)
    sn = getCurrentSnapshot()
    api = DBClient(dbm=dbm)

    if args.portal:
        P = api.Session.query(Portal).filter(Portal.id == args.portal).first()
        if P is None:
            log.warn("PORTAL NOT IN DB", portalid=args.portal)
            return
        else:
            crawler = CrawlerProcess()
            crawler.crawl(DataMonitorSpider,
                          api=api,
                          datadir=datadir,
                          snapshot=sn,
                          format=args.format,
                          portalID=P.id,
                          git_location=git_location,
                          csvclean=args.clean,
                          git_url=git_url + P.id + '/')
            crawler.start()
    else:
        for P in api.Session.query(Portal):
            log.warn("DOWNLOAD RESOURCES", portalid=P.id)
            crawler = CrawlerProcess()
            crawler.crawl(DataMonitorSpider,
                          api=api,
                          datadir=datadir,
                          snapshot=sn,
                          format=args.format,
                          portalID=P.id,
                          git_location=git_location,
                          csvclean=args.clean,
                          git_url=git_url + P.id + '/')
            crawler.start()
Example #12
    def get(self, portalid, datasetid):
        if request.headers.get('Accept-Datetime'):
            acc_dt = request.headers['Accept-Datetime']
            sn = getSnapshotfromTime(parse_rfc1123(acc_dt))
        else:
            sn = getCurrentSnapshot()

        resp = get_dataset(portalid, sn, datasetid)
        resp.headers['Vary'] = 'accept-datetime'
        d = tofirstdayinisoweek(sn)
        full_url = HOST + '/' + portalid + '/' + d.strftime("%Y%m%d") + '/' + datasetid
        resp.headers['Content-Location'] = full_url
        return resp
Example #13
def cli(args, dbm):
    sn = getCurrentSnapshot()

    dbConf = readDBConfFromFile(args.config)
    db = DBClient(dbm)

    store_local = None
    if args.config:
        with open(args.config) as f:
            config = yaml.load(f)
            if 'git' in config and 'datadir' in config['git']:
                store_local = config['git']['datadir']

    tasks = []
    if args.portalid:
        P = db.Session.query(Portal).filter(Portal.id == args.portalid).first()
        if P is None:
            log.warn("PORTAL NOT IN DB", portalid=args.portalid)
            return
        else:
            tasks.append((P, dbConf, sn, store_local))
    else:
        if args.repair:
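            # repair mode: re-queue only portals whose fetch for this snapshot did not
            # finish with HTTP status 200, after dropping their partial snapshot rows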
            valid = db.Session.query(PortalSnapshot.portalid).filter(
                PortalSnapshot.snapshot == sn).filter(
                    PortalSnapshot.status == 200).subquery()

            for P in db.Session.query(Portal).filter(Portal.id.notin_(valid)):
                PS = db.Session.query(PortalSnapshot).filter(
                    PortalSnapshot.snapshot == sn).filter(
                        PortalSnapshot.portalid == P.id)
                PS.delete(synchronize_session=False)
                PSQ = db.Session.query(PortalSnapshotQuality).filter(
                    PortalSnapshotQuality.snapshot == sn).filter(
                        PortalSnapshotQuality.portalid == P.id)
                PSQ.delete(synchronize_session=False)
                tasks.append((P, dbConf, sn, store_local))
        else:
            for P in db.Session.query(Portal):
                tasks.append((P, dbConf, sn, store_local))

    log.info("START FETCH",
             processors=args.processors,
             dbConf=dbConf,
             portals=len(tasks))

    pool = Pool(args.processors)
    for x in pool.imap(fetchHttp, tasks):
        pid, sn = x[0].id, x[1]
        log.info("RECEIVED RESULT", portalid=pid, snapshot=sn)
Example #14
def cli(args, dbm):

    sn = getCurrentSnapshot()
    db = DBClient(dbm)

    batch = args.batch
    concurrent = args.threads
    delay = args.delay

    log.info("START HEAD", batch=batch, delay=delay, threads=concurrent)

    rsession = requests.Session()
    robots = RobotsManager(rsession)

    q = DomainQueue(args.delay)

    filler = QueueFiller(db, q, robots, batch * 2, sn, concurrent)
    filler.daemon = True
    filler.filling_queue(batch=batch)

    resultQueue = Queue(maxsize=0)
    #start worker threads
    for i in range(concurrent):
        t = Worker(q=q,
                   resultQueue=resultQueue,
                   robots=robots,
                   rsession=rsession,
                   sn=sn)
        t.daemon = True
        t.start()

    filler.start()

    inserter = Inserter(db=db,
                        resultQueue=resultQueue,
                        domainQueue=q,
                        batch=batch / 2)
    inserter.start()

    filler.join()
    inserter.join()
    Timer.printStats()

    import sys
    sys.exit(0)
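Worker, QueueFiller and Inserter are defined elsewhere; a rough sketch of the consumer loop a Worker thread is assumed to run (hypothetical body; only the constructor arguments come from the code above, DomainQueue is assumed to expose the usual Queue get/task_done interface, and robots.txt handling is omitted):

import threading

class WorkerSketch(threading.Thread):
    def __init__(self, q, resultQueue, robots, rsession, sn):
        threading.Thread.__init__(self)
        self.q, self.resultQueue = q, resultQueue
        self.robots, self.rsession, self.sn = robots, rsession, sn

    def run(self):
        while True:
            url = self.q.get()
            try:
                # issue the HEAD request and hand the outcome to the Inserter via resultQueue
                r = self.rsession.head(url, allow_redirects=True)
                self.resultQueue.put((url, r.status_code, self.sn))
            finally:
                self.q.task_done()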
Example #15
def portalDynamicLinkCheck(portalid):
    snapshot = getCurrentSnapshot()
    return portalLinkCheck(snapshot, portalid)
Example #16
def csv_clean(filename,
              git_url,
              orig_url,
              metadata,
              stream_orig=True,
              max_file_size=10):
    # file size in MiB (bytes shifted right by 20); skip anything larger than max_file_size
    filesize = os.path.getsize(filename) >> 20
    if filesize > max_file_size:
        return

    # TODO read csv files in dir, run pyyacp and track modifications, read jsonld, add new resource with description and modifications
    out_encoding = 'utf-8'
    out_delimiter = ','

    reader = yacp.YACParser(filename=filename)
    deli = reader.meta['delimiter']
    encoding = reader.meta['encoding']
    descriptionLines = reader.descriptionLines
    header_line = reader.header_line

    f_name = os.path.basename(filename)

    cleaned_path = os.path.join(os.path.dirname(filename), '..', 'cleaned')
    if not os.path.exists(cleaned_path):
        os.mkdir(cleaned_path)
    cleaned_content = reader.generate(delimiter=out_delimiter, comments=False)

    with codecs.open(os.path.join(cleaned_path, f_name), 'w',
                     out_encoding) as out_f:
        out_f.write(cleaned_content.decode(out_encoding))

    g = rdflib.Graph()
    g.parse(metadata, format="json-ld")

    snapshot = getCurrentSnapshot()
    activity = adequate_prov(g, snapshot)
    if stream_orig:
        try:
            # add csvw info to orig resource
            stream_csv.addMetadata(orig_url,
                                   snapshot,
                                   g,
                                   csvw_activity=activity)
        except Exception as e:
            ErrorHandler.handleError(log,
                                     "GetCSVWMetadata",
                                     exception=e,
                                     url=orig_url,
                                     snapshot=snapshot,
                                     exc_info=True)

    dataset_ref = g.value(predicate=RDF.type, object=DCAT.Dataset)
    repo_name = g.value(subject=dataset_ref, predicate=AD.repositoryName)

    # add new resource
    git_res_page = git_url + str(
        repo_name) + '/' + 'tree/master/cleaned/' + f_name
    git_res_raw = git_url + str(
        repo_name) + '/' + 'raw/master/cleaned/' + f_name

    distribution = URIRef(git_res_page)
    access_url = URIRef(git_res_raw)

    g.add((dataset_ref, DCAT.distribution, distribution))
    g.add((distribution, RDF.type, DCAT.Distribution))
    g.add((distribution, DCAT.accessURL, access_url))

    # prov information
    g.add((access_url, RDF.type, PROV.Entity))
    g.add((access_url, PROV.wasDerivedFrom, URIRef(orig_url)))
    g.add((access_url, PROV.wasGeneratedBy, activity))
    g.add((activity, PROV.generated, access_url))

    # add CSV modifications to metadata
    if not header_line:
        g.add((access_url, AD.csvCleanModification, AD.GenericHeader))
    if deli != out_delimiter:
        g.add((access_url, AD.csvCleanModification, AD.DefaultDelimiter))
    if encoding != out_encoding:
        g.add((access_url, AD.csvCleanModification, AD.Utf8Encoding))
    if descriptionLines:
        g.add((access_url, AD.csvCleanModification, AD.DropCommentLines))
        # add comment lines metadata
        for l in descriptionLines:
            out = StringIO()
            w = csv.writer(out)
            w.writerow([v.encode(out_encoding) for v in l])
            g.add((distribution, RDFS.comment, Literal(out.getvalue())))

    g.serialize(destination=metadata, format='json-ld')
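A purely illustrative call to csv_clean, with placeholder paths and URLs (none of these values appear in the listing):

csv_clean('/data/git/some-repo/orig/table.csv',
          git_url='https://git.example.org/',
          orig_url='https://data.example.org/download/table.csv',
          metadata='/data/git/some-repo/metadata.jsonld',
          stream_orig=False,
          max_file_size=10)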
Example #17
    def _get_quality(self, args, data, filename):
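        # compute per-dataset quality metrics for a submitted dataset description and
        # return them as JSON, DQV JSON-LD, or CSV, depending on the requested format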
        try:
            content_type = 'application/json'
            default_url = 'http://missing.portal.url.com'
            portal_url = args.get('portal_url', default_url)
            if not portal_url:
                portal_url = default_url

            default_out = 'json'
            out_format = args.get('format', default_out)
            if not out_format:
                out_format = default_out

            filter_metrics = args.get('metric')

            if 'software' in args:
                software = args['software']

                # stub portal class
                class Portal:
                    def __init__(self, software, uri):
                        self.software = software
                        self.apiuri = uri

                p = Portal(software, portal_url)

                # get rdf graph and add measures and dimensions
                graph = rdflib.Graph()
                # write dcat dataset into graph
                dcat = dataset_converter.dict_to_dcat(data, p, graph=graph)
                measures_g = rdflib.Graph()
                ds_id = graph.value(predicate=RDF.type, object=DCAT.Dataset)
                datasetquality = DatasetQuality(data, dcat)
                metrics_dict = datasetquality.__dict__

                if filter_metrics:
                    metrics_dict = {m: metrics_dict[m] for m in filter_metrics}

                if out_format == 'json':
                    resp = jsonify(metrics_dict)
                elif out_format == 'json-ld':
                    dataset_quality_to_dqv(measures_g, ds_id, datasetquality,
                                           utils_snapshot.getCurrentSnapshot())
                    dqv_export.add_dimensions_and_metrics(measures_g)
                    resp = jsonify(
                        json.loads(measures_g.serialize(format="json-ld")))
                elif out_format == 'csv':
                    outstr = StringIO.StringIO()
                    w = csv.DictWriter(outstr, metrics_dict.keys())
                    w.writeheader()
                    w.writerow(metrics_dict)
                    resp = outstr.getvalue()
                    content_type = 'text/csv'
                else:
                    raise Exception('output format not supported: ' +
                                    out_format)

                filename = secure_filename(filename).split('/')[-1]
                return makeResponse(resp, filename, content_type=content_type)
            else:
                e = 'Portal software parameter required for conversion. ' \
                    '"software" should be "CKAN", "Socrata", or "OpenDataSoft".'
        except Exception as ex:
            e = ex.message

        resp = jsonify({'error': 'Could not parse JSON', 'message': e})
        resp.status_code = 406
        return resp
Example #18
def portalDataset(snapshot, portalid, dataset):
    with Timer(key="portalDataset", verbose=True):

        if not snapshot:
            snapshot = getCurrentSnapshot()

        Session = current_app.config['dbsession']
        data = getPortalInfos(Session, portalid, snapshot)
        #data['portals']= [ row2dict(r) for r in Session.query(Portal).all()]
        data.update(getPortalDatasets(Session, portalid, snapshot))

        dd = None
        if dataset:
            for dt in data['datasets']:
                if dt['id'] == dataset:
                    dd = dt
                    break
            with Timer(key="getPortalDatasets_datasetData", verbose=True):
                r = Session.query(DatasetData).join(Dataset).filter(
                    Dataset.id == dataset).join(DatasetQuality).add_entity(
                        DatasetQuality).first()
                data['datasetData'] = row2dict(r)
                software = Session.query(
                    Portal.software).filter(Portal.id == portalid).first()[0]
                if software == 'Socrata':
                    data['json'] = data['datasetData']['raw']['view']
                else:
                    data['json'] = data['datasetData']['raw']
                data['report'] = dataset_reporter.report(r[0],
                                                         r[1],
                                                         software=None)

                #with Timer(key="getSchemadotorgDatasets", verbose=True):
                #    q = Session.query(Portal).filter(Portal.id == portalid)
                #    p = q.first()
                #    schemadotorg = json.dumps(dcat_to_schemadotorg.convert(p, r[0]), indent=3)

            with Timer(key="getPortalDatasets_resources", verbose=True):
                q = Session.query(MetaResource, ResourceInfo).filter(
                    MetaResource.md5 == r[0].md5).outerjoin(
                        ResourceInfo,
                        and_(ResourceInfo.uri == MetaResource.uri,
                             ResourceInfo.snapshot == snapshot))
                data['resources'] = [row2dict(r) for r in q.all()]
                for r in data['resources']:
                    if 'header' in r and isinstance(r['header'], basestring):
                        r['header'] = ast.literal_eval(r['header'])

        with Timer(key="getPortalDatasets_versions", verbose=True):
            q = Session.query(Dataset.md5,
                              func.min(Dataset.snapshot).label('min'),
                              func.max(Dataset.snapshot).label('max')).filter(
                                  Dataset.id == dataset).group_by(Dataset.md5)
            r = [row2dict(r) for r in q.all()]
            print r
            versions = {}
            for i in r:
                a = versions.setdefault(i['md5'], [])
                a.append({'min': i['min'], 'max': i['max']})
            data['versions'] = r

        return render("odpw_portal_dataset.jinja",
                      data=data,
                      snapshot=snapshot,
                      portalid=portalid,
                      dataset=dd,
                      qa=qa,
                      error=errorStatus)