# NOTE: flask's request/jsonify, requests and rdflib are imported here;
# gc (gitlab client), dc (dataverse client), sc (store client), converter
# and log are module-level objects assumed to be defined elsewhere in this
# application.
from flask import request, jsonify
import requests
from rdflib import ConjunctiveGraph


def web_definition():
    """
    Get dataset metadata from a URL

    Loads the metadata for a file specified by the url and name parameters.
    Response is the same as for `/dataset/definition`
    ---
    parameters:
      - name: name
        in: query
        description: The name of the dataset file that is to be loaded
        required: false
        type: string
        defaultValue: "Mortality.monthly_MadrasIndia.1916_1921.tab"
      - name: url
        in: query
        description: The URL address of the file that is to be loaded
        required: false
        type: string
        defaultValue: "http://example.com/interesting/file.csv"
      - name: user
        in: query
        description: The email address of the Google user id
        required: true
        type: string
    tags:
      - Dataverse
    responses:
      '200':
        description: Dataset metadata retrieved
        schema:
          $ref: "#/definitions/DatasetSchema"
      default:
        description: Unexpected error
        schema:
          $ref: "#/definitions/Message"
    """
    dataset_url = request.args.get('url', False)
    dataset_name = request.args.get('name', False)
    username = request.args.get('user', False)

    log.debug("Web dataset: {}/{}".format(dataset_url, dataset_name))

    # Check whether a url, name and user have all been provided
    if not (dataset_url and dataset_name and username):
        raise Exception("You should provide a file url, name and user")

    response = requests.get(dataset_url)
    dataset_path = "{}/{}".format(username, dataset_name)

    if response.status_code == 200:
        gc.add_file(dataset_path, response.content)
        dataset_definition = gc.load(dataset_path, dataset_path)
        return jsonify(dataset_definition)
    else:
        raise Exception(
            "The dataset with URI {} could not be retrieved from the Web"
            .format(dataset_url))
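
# A minimal client-side sketch of calling the endpoint above. The route
# path and host are hypothetical (the @app.route decorator is not shown
# here); only the query parameters are taken from the Swagger docstring.
#
#   import requests
#   resp = requests.get(
#       "http://localhost:5000/dataset/web/definition",  # hypothetical path
#       params={"url": "http://example.com/interesting/file.csv",
#               "name": "Mortality.monthly_MadrasIndia.1916_1921.tab",
#               "user": "user@example.com"})
#   dataset_definition = resp.json()
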
def dataverse_definition():
    """
    Get dataset metadata from a dataverse file

    Loads the metadata for a dataverse file specified by the id and name
    parameters. Response is the same as for `/dataset/definition`
    ---
    parameters:
      - name: name
        in: query
        description: The name of the dataset file that is to be loaded
        required: false
        type: string
        defaultValue: "Mortality.monthly_MadrasIndia.1916_1921.tab"
      - name: id
        in: query
        description: The id of a dataverse dataset file that is to be loaded
        required: false
        type: string
        defaultValue: 2531997
      - name: user
        in: query
        description: The email address of the Google user id
        required: true
        type: string
    tags:
      - Dataverse
    responses:
      '200':
        description: Dataset metadata retrieved
        schema:
          $ref: "#/definitions/DatasetSchema"
      default:
        description: Unexpected error
        schema:
          $ref: "#/definitions/Message"
    """
    dataset_id = request.args.get('id', False)
    dataset_name = request.args.get('name', False)
    username = request.args.get('user', False)

    log.debug("Dataverse dataset: {}/{}".format(dataset_id, dataset_name))

    # Check whether an id, name and user have all been provided
    if not (dataset_id and dataset_name and username):
        raise Exception("You should provide a file id, name and user")

    # Resolve the Dataverse file id to a direct download URL
    dataset_url = dc.Connection().get_access_url(dataset_id)
    log.debug("Dataverse url: {}".format(dataset_url))

    response = requests.get(dataset_url)
    dataset_path = "{}/{}".format(username, dataset_name)
    log.debug(dataset_path)

    if response.status_code == 200:
        gc.add_file(dataset_path, response.content)
        dataset_definition = gc.load(dataset_path, dataset_path)
        return jsonify(dataset_definition)
    else:
        raise Exception(
            "The dataset with URI {} could not be retrieved from Dataverse"
            .format(dataset_url))
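
# For reference: dc.Connection().get_access_url(dataset_id) is assumed to
# resolve the numeric file id to a direct download URL on the Dataverse
# data access API, e.g. (hypothetical host, id from the docstring default):
#
#   https://dataverse.example.org/api/access/datafile/2531997
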
def dataset_submit():
    """
    Submit the dataset definition to the datalegend

    Uses the DataCube converter to convert the JSON representation of
    variables to RDF DataCube and commits the resulting RDF to the
    datalegend repository
    ---
    tags:
      - Dataset
    parameters:
      - name: dataset
        in: body
        description: The dataset definition that is to be converted and
                     committed to the datalegend repository
        required: true
        schema:
          type: object
          properties:
            dataset:
              description: The dataset definition
              $ref: "#/definitions/DatasetSchema"
            user:
              description: The Google user profile of the person uploading
                           the dataset
              type: object
    responses:
      '200':
        description: The dataset was converted successfully
        schema:
          $ref: "#/definitions/Message"
      default:
        description: Unexpected error
        schema:
          $ref: "#/definitions/Message"
    """
    req_json = request.get_json(force=True)
    dataset = req_json['dataset']
    user = req_json['user']

    log.debug("Writing cache to gitlab")
    gc.write_cache(dataset['file'], {'dataset': dataset})

    source_filename = gc.get_local_file_path(dataset['file'])
    log.debug("Converter will be reading from {}".format(source_filename))

    outfile = dataset['file'] + ".nq"
    target_filename = gc.get_local_file_path(outfile)
    log.debug("Converter will be writing to {}".format(target_filename))

    log.debug("Starting conversion ...")
    if 'path' in dataset:
        # TODO: check when there's a path in dataset... where does this
        # happen, and what is it for?
        log.debug("There's a path in this dataset")
        c = converter.Converter(dataset, '/tmp/', user,
                                source=dataset['path'],
                                target=target_filename)
    else:
        log.debug("There is no path in this dataset, filename is {}".format(
            source_filename))
        c = converter.Converter(dataset, '/tmp/', user,
                                source=source_filename,
                                target=target_filename)

    c.setProcesses(1)
    c.convert()
    log.debug("Conversion successful")

    with open(target_filename, "rb") as nquads_file:
        data = nquads_file.read()

    log.debug("Adding data to gitlab... ")
    file_info = gc.add_file(outfile, data)
    log.debug("Added to gitlab: {} ({})".format(file_info['url'],
                                                file_info['commit_id']))

    log.debug("Parsing dataset... ")
    g = ConjunctiveGraph()
    # TODO: This is really inefficient... why are we posting each graph
    # separately?
    g.parse(data=data, format="nquads")
    log.debug("DataSet parsed")

    # Post each named graph in the parsed N-Quads file to the store
    for graph in g.contexts():
        log.debug(graph)
        graph_uri = graph.identifier
        log.debug("Posting {} ...".format(graph_uri))
        sc.post_data(graph.serialize(format='turtle'), graph_uri=graph_uri)
        log.debug("... done")

    return jsonify({
        'code': 200,
        'message': 'Successfully submitted converted data to datalegend',
        'url': file_info['url']
    })
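
# A sketch of the JSON body the endpoint above expects, following the
# Swagger schema in its docstring (field values are illustrative only):
#
#   {
#       "dataset": {
#           "file": "user@example.com/Mortality.monthly_MadrasIndia.1916_1921.tab",
#           ...                      # remaining DatasetSchema fields
#       },
#       "user": {...}                # Google user profile of the uploader
#   }
#
# The converter reads the cached source file, writes N-Quads alongside it,
# commits the result to gitlab, and posts each named graph to the store.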