Example #1
def web_definition():
    """
    Get dataset metadata from a URL
    Loads the metadata for a file specified by the url and name parameters.
    Response is the same as for `/dataset/definition`
    ---
      parameters:
        - name: name
          in: query
          description: The name of the dataset file that is to be loaded
          required: false
          type: string
          defaultValue: "Mortality.monthly_MadrasIndia.1916_1921.tab"
        - name: url
          in: query
          description:
            The URL address of the file that is to be loaded
          required: false
          type: string
          defaultValue: "http://example.com/interesting/file.csv"
      tags:
        - Dataverse
      responses:
        '200':
          description: Dataset metadata retrieved
          schema:
            $ref: "#/definitions/DatasetSchema"
        default:
          description: Unexpected error
          schema:
            $ref: "#/definitions/Message"
    """
    dataset_url = request.args.get('url', False)
    dataset_name = request.args.get('name', False)
    username = request.args.get('user', False)

    log.debug("Web dataset: {}/{}".format(dataset_url, dataset_name))
    # Check that a url, name and user have been provided
    if not (dataset_url and dataset_name and username):
        raise Exception("You should provide a file url, name and user")

    response = requests.get(dataset_url)

    dataset_path = "{}/{}".format(username, dataset_name)

    if response.status_code == 200:
        gc.add_file(dataset_path, response.content)

        dataset_definition = gc.load(dataset_path, dataset_path)
        return jsonify(dataset_definition)
    else:
        raise Exception(
            "The dataset with URI {} could not be retrieved from the Web"
            .format(dataset_url))
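
This endpoint can be exercised with the requests library. A minimal client sketch, assuming the handler is registered at a route such as /dataset/web/definition (the route decorator is not shown above) and the service runs locally:

import requests

# '/dataset/web/definition' and the host are assumptions; only the handler
# function is shown in the example.
resp = requests.get(
    'http://localhost:5000/dataset/web/definition',
    params={
        'url': 'http://example.com/interesting/file.csv',
        'name': 'Mortality.monthly_MadrasIndia.1916_1921.tab',
        'user': 'someone@example.com',  # required by the handler's check
    },
    timeout=30,
)
resp.raise_for_status()
dataset_definition = resp.json()  # same shape as /dataset/definition
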
Example #2
def dataverse_definition():
    """
    Get dataset metadata from a dataverse file
    Loads the metadata for a dataverse file specified by the id and name parameters.
    Response is the same as for `/dataset/definition`
    ---
      parameters:
        - name: name
          in: query
          description: The name of the dataset file that is to be loaded
          required: false
          type: string
          defaultValue: "Mortality.monthly_MadrasIndia.1916_1921.tab"
        - name: id
          in: query
          description:
            The id of a dataverse dataset file that is to be loaded
          required: false
          type: string
          defaultValue: 2531997
        - name: user
          in: query
          description: The email address of the Google user
          required: true
          type: string
      tags:
        - Dataverse
      responses:
        '200':
          description: Dataset metadata retrieved
          schema:
            $ref: "#/definitions/DatasetSchema"
        default:
          description: Unexpected error
          schema:
            $ref: "#/definitions/Message"
    """
    dataset_id = request.args.get('id', False)
    dataset_name = request.args.get('name', False)
    username = request.args.get('user', False)

    log.debug("Dataverse dataset: {}/{}".format(dataset_id, dataset_name))
    # Check that an id, name and user have been provided
    if not (dataset_id and dataset_name and username):
        raise Exception("You should provide a file id, name and user")

    dataset_url = dc.Connection().get_access_url(dataset_id)
    log.debug("Dataverse url: {}".format(dataset_url))

    response = requests.get(dataset_url)

    dataset_path = "{}/{}".format(username, dataset_name)

    log.debug("Dataset path: {}".format(dataset_path))
    if response.status_code == 200:
        gc.add_file(dataset_path, response.content)

        dataset_definition = gc.load(dataset_path, dataset_path)
        return jsonify(dataset_definition)
    else:
        raise Exception(
            "The dataset with URI {} could not be retrieved from Dataverse"
            .format(dataset_url))
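
The Dataverse variant is called the same way, but with a file id instead of a URL. A minimal sketch under the same assumptions (hypothetical route and host):

import requests

# '/dataset/dataverse/definition' is an assumed route name.
resp = requests.get(
    'http://localhost:5000/dataset/dataverse/definition',
    params={
        'id': '2531997',  # the docstring's defaultValue
        'name': 'Mortality.monthly_MadrasIndia.1916_1921.tab',
        'user': 'someone@example.com',
    },
    timeout=30,
)
resp.raise_for_status()
print(resp.json())
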
Example #3
def dataset_submit():
    """
    Submit the dataset definition to the datalegend
    Uses the DataCube converter to convert the JSON representation of variables to RDF DataCube and commits
    the resulting RDF to the datalegend repository
    ---
    tags:
        - Dataset
    parameters:
        - name: dataset
          in: body
          description: The dataset definition that is to be converted and committed to the datalegend repository
          required: true
          schema:
            type: object
            properties:
                dataset:
                    description: The dataset definition
                    $ref: "#/definitions/DatasetSchema"
                user:
                    description: The Google user profile of the person uploading the dataset
                    type: object
    responses:
        '200':
            description: The dataset was converted successfully
            schema:
                $ref: "#/definitions/Message"
        default:
            description: Unexpected error
            schema:
              $ref: "#/definitions/Message"
    """

    req_json = request.get_json(force=True)
    dataset = req_json['dataset']
    user = req_json['user']

    log.debug("Writing cache to gitlab")
    gc.write_cache(dataset['file'], {'dataset': dataset})

    source_filename = gc.get_local_file_path(dataset['file'])
    log.debug("Converter will be reading from {}".format(source_filename))

    outfile = dataset['file'] + ".nq"
    target_filename = gc.get_local_file_path(outfile)
    log.debug("Converter will be writing to {}".format(target_filename))

    log.debug("Starting conversion ...")
    if 'path' in dataset:
        # TODO: check when there's a path in dataset... where does this happen, and what is it for?
        log.debug("There's a path in this dataset")
        c = converter.Converter(dataset,
                                '/tmp/',
                                user,
                                source=dataset['path'],
                                target=target_filename)
    else:
        log.debug("There is no path in this dataset, filename is {}".format(
            source_filename))
        c = converter.Converter(dataset,
                                '/tmp/',
                                user,
                                source=source_filename,
                                target=target_filename)

    c.setProcesses(1)
    c.convert()
    log.debug("Conversion successful")

    with open(target_filename, "rb") as nquads_file:
        data = nquads_file.read()

    log.debug("Adding data to gitlab... ")
    file_info = gc.add_file(outfile, data)
    log.debug("Added to gitlab: {} ({})".format(file_info['url'],
                                                file_info['commit_id']))

    log.debug("Parsing dataset... ")
    g = ConjunctiveGraph()
    # TODO: This is really inefficient... why are we posting each graph separately?
    g.parse(data=data, format="nquads")
    log.debug("DataSet parsed")

    for graph in g.contexts():
        log.debug(graph)
        graph_uri = graph.identifier
        log.debug("Posting {} ...".format(graph_uri))
        sc.post_data(graph.serialize(format='turtle'), graph_uri=graph_uri)
        log.debug("... done")

    return jsonify({
        'code': 200,
        'message': 'Successfully submitted converted data to datalegend',
        'url': file_info['url']
    })
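
The handler reads only the 'dataset' and 'user' keys from the request body, and within the dataset it uses 'file' (and, if present, 'path'). A minimal sketch of a client call; the route name and the field values are illustrative, not taken from the example:

import requests

payload = {
    'dataset': {
        # 'file' names the cached definition in the gitlab-backed store;
        # a full object would follow #/definitions/DatasetSchema.
        'file': 'someone@example.com/Mortality.monthly_MadrasIndia.1916_1921.tab',
        # 'path': '/tmp/local.csv',  # optional; switches the converter source
    },
    'user': {'email': 'someone@example.com'},  # Google user profile object
}

resp = requests.post('http://localhost:5000/dataset/submit',  # assumed route
                     json=payload, timeout=300)
print(resp.json()['message'])
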
Example #4
def dataset_submit():
    """
    Submit the dataset definition to the CSDH
    Uses the DataCube converter to convert the JSON representation of variables to RDF DataCube and commits
    the resulting RDF to the CSDH repository
    ---
    tags:
        - Dataset
    parameters:
        - name: dataset
          in: body
          description: The dataset definition that is to be converted and committed to the CSDH repository
          required: true
          schema:
            type: object
            properties:
                dataset:
                    description: The dataset definition
                    $ref: "#/definitions/DatasetSchema"
                user:
                    description: The Google user profile of the person uploading the dataset
                    type: object
    responses:
        '200':
            description: The dataset was converted successfully
            schema:
                $ref: "#/definitions/Message"
        default:
            description: Unexpected error
            schema:
              $ref: "#/definitions/Message"
    """

    req_json = request.get_json(force=True)
    dataset = req_json['dataset']
    user = req_json['user']

    log.debug("Writing cache to gitlab")
    gc.write_cache(dataset['file'], {'dataset': dataset})

    source_filename = gc.get_local_file_path(dataset['file'])
    log.debug("Converter will be reading from {}".format(source_filename))

    outfile = dataset['file'] + ".nq"
    target_filename = gc.get_local_file_path(outfile)
    log.debug("Converter will be writing to {}".format(target_filename))

    log.debug("Starting conversion ...")
    if 'path' in dataset:
        # TODO: check when there's a path in dataset... where does this happen, and what is it for?
        log.debug("There's a path in this dataset")
        c = converter.Converter(dataset, '/tmp/', user, source=dataset['path'], target=target_filename)
    else:
        log.debug("There is no path in this dataset, filename is {}".format(source_filename))
        c = converter.Converter(dataset, '/tmp/', user, source=source_filename, target=target_filename)

    c.setProcesses(1)
    c.convert()
    log.debug("Conversion successful")

    with open(target_filename, "rb") as nquads_file:
        data = nquads_file.read()

    log.debug("Adding data to gitlab... ")
    file_info = gc.add_file(outfile, data)
    log.debug("Added to gitlab: {} ({})".format(file_info['url'], file_info['commit_id']))

    log.debug("Parsing dataset... ")
    g = ConjunctiveGraph()
    # TODO: This is really inefficient... why are we posting each graph separately?
    g.parse(data=data, format="nquads")
    log.debug("DataSet parsed")

    for graph in g.contexts():
        log.debug(graph)
        graph_uri = graph.identifier
        log.debug("Posting {} ...".format(graph_uri))
        sc.post_data(graph.serialize(format='turtle'), graph_uri=graph_uri)
        log.debug("... done")

    return jsonify({'code': 200,
                    'message': 'Successfully submitted converted data to CSDH',
                    'url': file_info['url']})
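
Both submit examples fan the converted N-Quads back out one named graph at a time. The rdflib behaviour they rely on can be reproduced in isolation; a self-contained sketch:

from rdflib import ConjunctiveGraph

# Two quads in two different named graphs.
nquads = '''
<http://example.org/s> <http://example.org/p> "a" <http://example.org/g1> .
<http://example.org/s> <http://example.org/p> "b" <http://example.org/g2> .
'''

g = ConjunctiveGraph()
g.parse(data=nquads, format="nquads")

# contexts() yields one Graph per named graph; its identifier is the graph
# URI. Serializing a single context as Turtle drops that URI, which is why
# the examples pass it separately as graph_uri when posting.
for graph in g.contexts():
    print(graph.identifier)
    print(graph.serialize(format="turtle"))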