示例#1
0
def dataset_save():
    """
    Save the dataset to the datalegend file cache
    Note that this does not convert the dataset to RDF, nor does it upload it to the datalegend repository
    ---
    tags:
        - Dataset
    parameters:
        - name: dataset
          in: body
          description: The dataset definition that is to be saved to cache.
          required: true
          type: object
          schema:
            $ref: "#/definitions/DatasetSchema"
    responses:
        '200':
            description: The dataset was succesfully saved to the file cache
            schema:
                $ref: "#/definitions/Message"
        default:
            description: Unexpected error
            schema:
              $ref: "#/definitions/Message"
    """
    # NOTE(review): the docstring above doubles as the endpoint's Swagger/OpenAPI
    # description (presumably parsed by the API documentation tooling -- confirm),
    # so it is kept verbatim.

    # force=True: parse the body as JSON regardless of the request's Content-Type.
    payload = request.get_json(force=True)
    definition = payload['dataset']

    # write_cache receives the dataset's 'file' value together with the
    # definition wrapped under a 'dataset' key.
    cache_entry = {'dataset': definition}
    gc.write_cache(definition['file'], cache_entry)

    response_body = {'code': 200, 'message': 'Success'}
    return jsonify(response_body)
示例#2
0
def dataset_save():
    """
    Save the dataset to the CSDH file cache
    Note that this does not convert the dataset to RDF, nor does it upload it to the CSDH repository
    ---
    tags:
        - Dataset
    parameters:
        - name: dataset
          in: body
          description: The dataset definition that is to be saved to cache.
          required: true
          type: object
          schema:
            $ref: "#/definitions/DatasetSchema"
    responses:
        '200':
            description: The dataset was succesfully saved to the file cache
            schema:
                $ref: "#/definitions/Message"
        default:
            description: Unexpected error
            schema:
              $ref: "#/definitions/Message"
    """
    # NOTE(review): the docstring above doubles as the endpoint's Swagger/OpenAPI
    # description (presumably parsed by the API documentation tooling -- confirm),
    # so it is kept verbatim.

    # force=True: parse the body as JSON regardless of the request's Content-Type.
    body = request.get_json(force=True)
    dataset_definition = body['dataset']

    # write_cache receives the dataset's 'file' value together with the
    # definition wrapped under a 'dataset' key.
    wrapped = {'dataset': dataset_definition}
    gc.write_cache(dataset_definition['file'], wrapped)

    success_message = {'code': 200, 'message': 'Success'}
    return jsonify(success_message)
示例#3
0
def dataset_submit():
    """
    Submit the dataset definition to the datalegend
    Uses the DataCube converter to convert the JSON representation of variables to RDF DataCube and commits
    the resulting RDF to the datalegend repository
    ---
    tags:
        - Dataset
    parameters:
        - name: dataset
          in: body
          description: The dataset definition that is to be converted and committed to the datalegend repository
          required: true
          schema:
            type: object
            properties:
                dataset:
                    description: The dataset definition
                    $ref: "#/definitions/DatasetSchema"
                user:
                    description: The Google user profile of the person uploading the dataset
                    type: object
    responses:
        '200':
            description: The dataset was converted succesfully
            schema:
                $ref: "#/definitions/Message"
        default:
            description: Unexpected error
            schema:
              $ref: "#/definitions/Message"
    """
    # NOTE(review): the docstring above doubles as the endpoint's Swagger/OpenAPI
    # description (presumably parsed by the API documentation tooling -- confirm),
    # so it is kept verbatim.

    # force=True: parse the body as JSON regardless of the request's Content-Type.
    req_json = request.get_json(force=True)
    dataset = req_json['dataset']
    user = req_json['user']

    # Step 1: store the dataset definition in the cache before converting.
    log.debug("Writing cache to gitlab")
    gc.write_cache(dataset['file'], {'dataset': dataset})

    source_filename = gc.get_local_file_path(dataset['file'])
    log.debug("Converter will be reading from {}".format(source_filename))

    # The converted output is written next to the source, with an ".nq" suffix.
    outfile = dataset['file'] + ".nq"
    target_filename = gc.get_local_file_path(outfile)
    log.debug("Converter will be writing to {}".format(target_filename))

    # Step 2: run the converter. The two cases differ only in which file the
    # converter reads from, so pick the source first and build the converter once.
    log.debug("Starting conversion ...")
    if 'path' in dataset:
        # TODO: check when there's a path in dataset... where does this happen, and what is it for?
        # NOTE(review): 'path' comes straight from the request body; confirm that
        # letting the client choose the file the converter reads is intended.
        log.debug("There's a path in this dataset")
        converter_source = dataset['path']
    else:
        log.debug("There is no path in this dataset, filename is {}".format(
            source_filename))
        converter_source = source_filename

    c = converter.Converter(dataset,
                            '/tmp/',
                            user,
                            source=converter_source,
                            target=target_filename)
    c.setProcesses(1)
    c.convert()
    log.debug("Conversion successful")

    # Step 3: read the converter output back and add it as a file via gc.
    with open(target_filename, "rb") as nquads_file:
        data = nquads_file.read()

    log.debug("Adding data to gitlab... ")
    file_info = gc.add_file(outfile, data)
    log.debug("Added to gitlab: {} ({})".format(file_info['url'],
                                                file_info['commit_id']))

    # Step 4: parse the N-Quads and post every named graph separately.
    log.debug("Parsing dataset... ")
    g = ConjunctiveGraph()
    # TODO: This is really inefficient... why are we posting each graph separately?
    g.parse(data=data, format="nquads")
    log.debug("DataSet parsed")

    for graph in g.contexts():
        # Log the graph being posted in this iteration (the original logged the
        # enclosing ConjunctiveGraph `g` on every pass, which carried no
        # per-iteration information).
        log.debug(graph)
        graph_uri = graph.identifier
        log.debug("Posting {} ...".format(graph_uri))
        sc.post_data(graph.serialize(format='turtle'), graph_uri=graph_uri)
        log.debug("... done")

    return jsonify({
        'code': 200,
        'message': 'Succesfully submitted converted data to datalegend',
        'url': file_info['url']
    })
示例#4
0
def dataset_submit():
    """
    Submit the dataset definition to the CSDH
    Uses the DataCube converter to convert the JSON representation of variables to RDF DataCube and commits
    the resulting RDF to the CSDH repository
    ---
    tags:
        - Dataset
    parameters:
        - name: dataset
          in: body
          description: The dataset definition that is to be converted and committed to the CSDH repository
          required: true
          schema:
            type: object
            properties:
                dataset:
                    description: The dataset definition
                    $ref: "#/definitions/DatasetSchema"
                user:
                    description: The Google user profile of the person uploading the dataset
                    type: object
    responses:
        '200':
            description: The dataset was converted succesfully
            schema:
                $ref: "#/definitions/Message"
        default:
            description: Unexpected error
            schema:
              $ref: "#/definitions/Message"
    """
    # NOTE(review): the docstring above doubles as the endpoint's Swagger/OpenAPI
    # description (presumably parsed by the API documentation tooling -- confirm),
    # so it is kept verbatim.

    # force=True: parse the body as JSON regardless of the request's Content-Type.
    req_json = request.get_json(force=True)
    dataset = req_json['dataset']
    user = req_json['user']

    # Step 1: store the dataset definition in the cache before converting.
    log.debug("Writing cache to gitlab")
    gc.write_cache(dataset['file'], {'dataset': dataset})

    source_filename = gc.get_local_file_path(dataset['file'])
    log.debug("Converter will be reading from {}".format(source_filename))

    # The converted output is written next to the source, with an ".nq" suffix.
    outfile = dataset['file'] + ".nq"
    target_filename = gc.get_local_file_path(outfile)
    log.debug("Converter will be writing to {}".format(target_filename))

    # Step 2: run the converter. The two cases differ only in which file the
    # converter reads from, so pick the source first and build the converter once.
    log.debug("Starting conversion ...")
    if 'path' in dataset:
        # TODO: check when there's a path in dataset... where does this happen, and what is it for?
        # NOTE(review): 'path' comes straight from the request body; confirm that
        # letting the client choose the file the converter reads is intended.
        log.debug("There's a path in this dataset")
        converter_source = dataset['path']
    else:
        log.debug("There is no path in this dataset, filename is {}".format(source_filename))
        converter_source = source_filename

    c = converter.Converter(dataset, '/tmp/', user, source=converter_source, target=target_filename)
    c.setProcesses(1)
    c.convert()
    log.debug("Conversion successful")

    # Step 3: read the converter output back and add it as a file via gc.
    with open(target_filename, "rb") as nquads_file:
        data = nquads_file.read()

    log.debug("Adding data to gitlab... ")
    file_info = gc.add_file(outfile, data)
    log.debug("Added to gitlab: {} ({})".format(file_info['url'], file_info['commit_id']))

    # Step 4: parse the N-Quads and post every named graph separately.
    log.debug("Parsing dataset... ")
    g = ConjunctiveGraph()
    # TODO: This is really inefficient... why are we posting each graph separately?
    g.parse(data=data, format="nquads")
    log.debug("DataSet parsed")

    for graph in g.contexts():
        # Log the graph being posted in this iteration (the original logged the
        # enclosing ConjunctiveGraph `g` on every pass, which carried no
        # per-iteration information).
        log.debug(graph)
        graph_uri = graph.identifier
        log.debug("Posting {} ...".format(graph_uri))
        sc.post_data(graph.serialize(format='turtle'), graph_uri=graph_uri)
        log.debug("... done")

    return jsonify({'code': 200,
                    'message': 'Succesfully submitted converted data to CSDH',
                    'url': file_info['url']})