import logging

from flask import request, jsonify
from rdflib import ConjunctiveGraph

# NOTE: gc (the GitLab file-cache client), sc (the SPARQL/triple-store
# client) and the `converter` package (the DataCube converter) are
# project-specific modules; they are assumed to be imported and initialised
# elsewhere in this module and are not reproduced here.

log = logging.getLogger(__name__)

def dataset_save():
    """
    Save the dataset to the CSDH file cache

    Note that this does not convert the dataset to RDF, nor does it upload
    it to the CSDH repository
    ---
    tags:
      - Dataset
    parameters:
      - name: dataset
        in: body
        description: The dataset definition that is to be saved to cache.
        required: true
        type: object
        schema:
          $ref: "#/definitions/DatasetSchema"
    responses:
      '200':
        description: The dataset was successfully saved to the file cache
        schema:
          $ref: "#/definitions/Message"
      default:
        description: Unexpected error
        schema:
          $ref: "#/definitions/Message"
    """
    req_json = request.get_json(force=True)
    dataset = req_json['dataset']

    # dataset_path = os.path.join(config.TEMP_PATH, dataset['file'])
    gc.write_cache(dataset['file'], {'dataset': dataset})
    # fc.write_cache(dataset_path, {'dataset': dataset})

    return jsonify({'code': 200, 'message': 'Success'})
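
# Usage sketch (hypothetical): how a client might call the endpoint above,
# assuming it is routed at POST /dataset/save on a local development server.
# The route, host and example values are assumptions; only the payload shape
# ({'dataset': {'file': ...}}) follows from the code above.
#
#   import requests
#
#   payload = {'dataset': {'file': 'example.csv', 'variables': {}}}
#   resp = requests.post('http://localhost:5000/dataset/save', json=payload)
#   print(resp.json())  # expected: {'code': 200, 'message': 'Success'}
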
def dataset_submit():
    """
    Submit the dataset definition to the CSDH

    Uses the DataCube converter to convert the JSON representation of
    variables to RDF DataCube and commits the resulting RDF to the CSDH
    repository
    ---
    tags:
      - Dataset
    parameters:
      - name: dataset
        in: body
        description: The dataset definition that is to be converted and
                     committed to the CSDH repository
        required: true
        schema:
          type: object
          properties:
            dataset:
              description: The dataset definition
              $ref: "#/definitions/DatasetSchema"
            user:
              description: The Google user profile of the person uploading
                           the dataset
              type: object
    responses:
      '200':
        description: The dataset was converted successfully
        schema:
          $ref: "#/definitions/Message"
      default:
        description: Unexpected error
        schema:
          $ref: "#/definitions/Message"
    """
    req_json = request.get_json(force=True)
    dataset = req_json['dataset']
    user = req_json['user']

    log.debug("Writing cache to gitlab")
    gc.write_cache(dataset['file'], {'dataset': dataset})

    source_filename = gc.get_local_file_path(dataset['file'])
    log.debug("Converter will be reading from {}".format(source_filename))

    outfile = dataset['file'] + ".nq"
    target_filename = gc.get_local_file_path(outfile)
    log.debug("Converter will be writing to {}".format(target_filename))

    log.debug("Starting conversion ...")
    if 'path' in dataset:
        # TODO: check when there's a path in dataset... where does this
        # happen, and what is it for?
        log.debug("There's a path in this dataset")
        c = converter.Converter(dataset, '/tmp/', user,
                                source=dataset['path'],
                                target=target_filename)
    else:
        log.debug("There is no path in this dataset, filename is {}".format(
            source_filename))
        c = converter.Converter(dataset, '/tmp/', user,
                                source=source_filename,
                                target=target_filename)

    c.setProcesses(1)
    c.convert()
    log.debug("Conversion successful")

    with open(target_filename, "rb") as nquads_file:
        data = nquads_file.read()

    log.debug("Adding data to gitlab... ")
    file_info = gc.add_file(outfile, data)
    log.debug("Added to gitlab: {} ({})".format(file_info['url'],
                                                file_info['commit_id']))

    log.debug("Parsing dataset... ")
    g = ConjunctiveGraph()
    # TODO: This is really inefficient... why are we posting each graph
    # separately?
    g.parse(data=data, format="nquads")
    log.debug("DataSet parsed")

    for graph in g.contexts():
        log.debug(graph)
        graph_uri = graph.identifier
        log.debug("Posting {} ...".format(graph_uri))
        sc.post_data(graph.serialize(format='turtle'), graph_uri=graph_uri)
        log.debug("... done")

    return jsonify({'code': 200,
                    'message': 'Successfully submitted converted data to CSDH',
                    'url': file_info['url']})
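
# Sketch (an assumption, not this project's API): the loop above
# re-serialises and POSTs every named graph separately, which the TODO
# flags as inefficient. If the backing store speaks the RDF4J REST API
# (as GraphDB does), the converted N-Quads file could be shipped whole in
# a single request, since N-Quads carries its own graph names. STORE_URL
# and the repository name are hypothetical.
#
#   import requests
#
#   STORE_URL = 'http://localhost:7200/repositories/csdh/statements'
#   with open(target_filename, 'rb') as nquads_file:
#       requests.post(STORE_URL,
#                     data=nquads_file.read(),
#                     headers={'Content-Type': 'application/n-quads'})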