def validate(resource=None):
    """Validate the GTEx v8 datapackage and attempt to read its resource(s).

    Parameters
    ----------
    resource : str or None
        Name of a single resource to read, or 'all' to read every
        resource in the package.  NOTE(review): any value other than
        'all' (including the default None) is passed straight to
        ``get_resource``; an unknown name returns None and will crash
        on ``r.name`` — confirm callers always pass a valid name.
    """
    # Raises if the descriptor itself is invalid.
    datapackage.validate('datapackage/gtex-v8-datapackage.json')
    gtex_package = datapackage.DataPackage(
        'datapackage/gtex-v8-datapackage.json', strict=True)

    def _read_resource(r):
        # Shared read-and-report logic.  Previously duplicated in both
        # branches, and inconsistent: one branch printed each CastError
        # entry individually while the other dumped the whole list.
        print(r.name)
        try:
            t = r.read()
        except CastError as ce:
            print('Hit cast error')
            for err in ce.errors:
                print(err)
            print(ce)
        except Exception as inst:
            print('Hit generic exception')
            print(type(inst))
            print(inst.args)
            print(inst)

    if resource != 'all':
        _read_resource(gtex_package.get_resource(resource))
    else:
        for r in gtex_package.resources:
            _read_resource(r)
def validate(descriptor):
    """Validate *descriptor* and report the outcome; exit(1) if invalid."""
    try:
        datapackage.validate(descriptor)
    except datapackage.exceptions.ValidationError as exc:
        click.echo('Data package descriptor is invalid')
        for err in exc.errors:
            click.echo(err)
        exit(1)
    else:
        # Only reached when validate() raised nothing.
        click.echo('Data package descriptor is valid')
def process_input(infile, validate=False, debug=False):
    """Read a datapackage descriptor line from *infile*, validate it, and
    return the descriptor plus an iterator over its streaming resources.

    :param infile: file-like object; first line is the JSON descriptor
    :param validate: forwarded to each ResourceIterator
    :param debug: forwarded to each ResourceIterator
    :returns: tuple ``(dp, resources_iterator)``
    :raises SystemExit: exit code -3 when the descriptor line is empty
    :raises ValidationError: re-raised after logging each validation error
    """
    dp_json = infile.readline().strip()
    if dp_json == '':
        sys.exit(-3)
    dp = json.loads(dp_json)
    resources = dp.get('resources', [])
    original_resources = copy.deepcopy(resources)
    if not resources:
        # Currently datapackages with no resources are disallowed in the schema.
        # Since this might happen in the early stages of a pipeline,
        # we're adding this hack to avoid validation errors
        dp_to_validate = copy.deepcopy(dp)
        dp_to_validate['resources'] = [{
            'name': '__placeholder__',
            'path': PATH_PLACEHOLDER
        }]
    else:
        dp_to_validate = dp
    try:
        datapackage.validate(dp_to_validate)
    except ValidationError as e:
        logging.info('ABOUT TO VALIDATE %r', dp_to_validate)
        # BUG FIX: the loop variable used to shadow the caught exception `e`.
        for err in e.errors:
            try:
                logging.error(
                    "Data Package validation error: %s at dp%s",
                    err.message,
                    "[%s]" % "][".join(repr(index) for index in err.path))
            except Exception:
                # Error object lacks jsonschema-style attributes
                # (was a bare `except:`) — log it as-is instead.
                logging.error("Data Package validation error: %s", err)
        raise
    # Consume (and discard) the separator line after the descriptor.
    infile.readline().strip()

    def resources_iterator(_resources, _original_resources):
        # we pass a resource instance that may be changed by the processing
        # code, so we must keep a copy of the original resource (used to
        # validate incoming data)
        ret = []
        for resource, orig_resource in zip(_resources, _original_resources):
            if not streaming(resource):
                continue
            res_iter = ResourceIterator(infile, resource, orig_resource,
                                        validate, debug)
            ret.append(res_iter)
        return iter(ret)

    return dp, resources_iterator(resources, original_resources)
def test_validate_invalid():
    """An empty descriptor must fail with exactly one 'resources' error."""
    with pytest.raises(exceptions.ValidationError) as excinfo:
        validate({})
    errors = excinfo.value.errors
    assert len(errors) == 1
    assert 'resources' in str(errors[0])
def test_validate_valid():
    """A known-good descriptor validates successfully (truthy return)."""
    assert validate('data/datapackage/datapackage.json')
#!/usr/bin/env python3
###################################################################################
# Please install tableschema before running: https://pypi.org/project/tableschema/
###################################################################################
import tableschema
from datapackage import Package, Resource, validate, exceptions

c2m2_schema = '004_HMP__C2M2_preload__preBag_output_files/datapackage.json'

# Validate the C2M2 preload datapackage descriptor; on failure, print
# each individual validation error.
try:
    valid = validate(c2m2_schema)
except exceptions.ValidationError as exc:
    for err in exc.errors:
        print(err)
def ingest(config_file):
    """Ingest a datapackage.

    Reads the ingest configuration, validates the referenced datapackage,
    then runs the configured load steps against the configured store.

    :param config_file: path to the configuration consumed by initialize()
    :raises Exception: when the datapackage is invalid or no store is configured
    """
    # Read the config file telling you what to do.
    # BUG FIX: the result was assigned to `config` but every later use said `cfg`.
    cfg = initialize(config_file)

    # Inspect the datapackage
    dp = datapackage.DataPackage(cfg['ingest'].get('datapackage', None))
    if dp.errors:
        for error in dp.errors:
            logging.error(error)
        raise Exception('Invalid data package')

    # Validate the Datapackage
    try:
        valid = datapackage.validate(dp.descriptor)
    except exceptions.ValidationError as exception:
        # BUG FIX: was `datapackage.exception.errors` — a nonexistent attribute,
        # which would have raised AttributeError while handling the real error.
        for error in exception.errors:
            logging.error(error)
        raise Exception('Invalid data package')

    # Generate datasetId
    # BUG FIX: was generateDatasetId(datapackage) — passed the imported module,
    # not the loaded package instance.
    datasetId = generateDatasetId(dp)
    logging.info('Dataset ID: %s' % (datasetId))

    # execute
    store_type = cfg.get('store', None)
    if store_type is None:
        raise Exception('The configuration does not define an ingest store')
    # NOTE(review): __import__('oceanproteinportal.store') returns the
    # top-level 'oceanproteinportal' package, so getattr looks up
    # `store_type` there — confirm the store classes are re-exported.
    module = __import__('oceanproteinportal.store')
    store_ = getattr(module, store_type)
    store = store_()

    if cfg['ingest'].get('load-dataset-metadata', False):
        logging.info('***** LOADING DATASET METADATA *****')
        store.loadDatasetMetadata(datapackage=dp, datasetId=datasetId)

    if cfg['ingest'].get('load-protein-data', False):
        protein_row_start = cfg['ingest'].get('protein-load-row-start', 0)
        protein_row_stop = cfg['ingest'].get('protein-load-row-stop', None)
        logging.info('***** LOADING PROTEINS (row=%s, %s) *****' % (protein_row_start, protein_row_stop))
        store.loadProteins(datapackage=dp, datasetId=datasetId,
                           row_start=protein_row_start, row_stop=protein_row_stop)

    if cfg['ingest'].get('calculate-dataset-metadata-stats', False):
        logging.info('***** UPDATING DATASET Sample STATS *****')
        store.updateDatasetSampleStats(datasetId=datasetId)

    if cfg['ingest'].get('load-fasta', False):
        logging.info('***** LOAD PROTEIN FASTA *****')
        store.loadProteinsFASTA(datapackage=dp, datasetId=datasetId)

    if cfg['ingest'].get('load-peptide-data', False):
        peptide_row_start = cfg['ingest'].get('peptide-load-row-start', 0)
        peptide_row_stop = cfg['ingest'].get('peptide-load-row-stop', None)
        logging.info('***** LOADING PEPTIDES (row=%s, %s) *****' % (peptide_row_start, peptide_row_stop))
        store.loadPeptide(datapackage=dp, datasetId=datasetId,
                          row_start=peptide_row_start, row_stop=peptide_row_stop)

    if cfg['ingest'].get('add-peptides-to-proteins', False):
        # BUG FIX: was `storeupdateProteinsWithPeptide(...)` — missing the dot,
        # referencing an undefined name instead of the store method.
        store.updateProteinsWithPeptide(datapackage=dp, datasetId=datasetId)
from datapackage import Package from datapackage import exceptions import os from datapackage import validate, exceptions ROOT = '/home/pgi/dev/toflit18_data/scripts/' SKIP_RESOURCES = [] p = Package(os.path.join(ROOT, 'datapackage.json'), ROOT) if not p.valid: for error in p.errors: print(error) try: valid = validate(p.descriptor) print("valid?: %s" % valid) except exceptions.ValidationError as exception: for error in exception.errors: # handle individual error print(error) for resource in p.resources: print(resource.name) if not resource.valid: for error in resource.errors: print(error) try: print("%s relations" % resource.name) errors = resource.read() resource.check_relations() # relations are kept in the resource object => memory leak