def test_load_invalid_manifest(self):
    """Test that invalid manifest files fail validation with the expected errors."""
    invalid_dir = os_path.join(_TEST_DIR, 'invalid_manifest')
    error_list = [
        {
            # no file list provided
            'file': 'no_file_list',
            'msg': "'file_list' is a required property",
        },
        {
            # a cluster file entry should have a prefix
            'file': 'cluster_no_prefix',
            'msg': r"{'data_type': 'cluster', 'path': 'I2_named.tsv'} is not valid under any of the given schemas",
        },
        {
            # each file_list entry has to have a path
            'file': 'missing_path',
            'msg': "'path' is a required property",
        },
        {
            # if the date is not quoted, pyyaml will turn it into a date object. Doh!
            'file': 'date_not_in_quotes',
            'msg': r"datetime\.date\(2020, 12, 25\) is not of type 'string'",
        },
        {
            # file format is invalid
            'file': 'invalid_format',
            'msg': r"'txt' is not one of \['tsv', 'csv'\]",
        },
        {
            # there must be an indicator of file format
            'file': 'no_file_format',
            'msg': r"{'data_type': 'edge', 'date': '2020-12-25', 'path': 'edge_data'}"
                   " is not valid under any of the given schemas",
        },
    ]
    for entry in error_list:
        data_file = os_path.join(invalid_dir, entry['file'] + '.yaml')
        print('looking at ' + data_file)
        with self.assertRaisesRegex(ValidationError, entry['msg']):
            run_validator(schema_file=schema_file, data_file=data_file, nicer_errors=True)
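# A hedged sketch reproducing the 'no_file_format' failure above without a
# fixture file, by handing run_validator an in-memory manifest via the data=
# keyword (which this suite exercises elsewhere). The manifest contents are
# copied from the asserted error message; the test method name is hypothetical.
def test_no_file_format_inline(self):
    bad_manifest = {
        'file_list': [
            # no 'format' key and no recognisable file extension on 'path',
            # so the entry matches none of the schema's file-format branches
            {'data_type': 'edge', 'date': '2020-12-25', 'path': 'edge_data'},
        ],
    }
    with self.assertRaisesRegex(ValidationError, 'is not valid under any of the given schemas'):
        run_validator(schema_file=schema_file, data=bad_manifest, nicer_errors=True)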
def _get_manifest(self, configuration):
    """
    Read the manifest file, which contains path and file type info, and validate it.
    The manifest is expected to be at ROOT_DATA_PATH/manifest.yaml.
    """
    schema_file = self._get_manifest_schema_file()
    # load the manifest and validate it against the schema
    manifest_file = os.path.join(configuration['ROOT_DATA_PATH'], 'manifest.yaml')
    try:
        with open(manifest_file) as fd:
            manifest = yaml.safe_load(fd)
    except FileNotFoundError:
        raise RuntimeError(
            f"No manifest file found at {manifest_file}.\n"
            "Please ensure that you have created a manifest that lists the files "
            "in the release")
    try:
        validated_manifest = run_validator(schema_file=schema_file, data=manifest)
    except Exception as err:
        print(err)
        raise RuntimeError(
            "The manifest file failed validation. Please recheck the file and try again.")
    return validated_manifest
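# A hedged sketch of the manifest that _get_manifest expects at
# ROOT_DATA_PATH/manifest.yaml. The required 'file_list' key, the per-entry
# 'path', the quoted date, and the tsv/csv formats all come from the
# validation tests in this suite; the 'prefix' key name, the 'description'
# field, and extension-based format inference are assumptions drawn from the
# fixture names ('cluster_no_prefix', 'with_descriptions', 'no_file_format').
import yaml

example_manifest = yaml.safe_load("""
file_list:
  - path: clusters/I2_named.tsv
    data_type: cluster
    prefix: I2                   # cluster entries must carry a prefix (key name assumed)
  - path: edge_data
    data_type: edge
    format: csv                  # no file extension, so the format is given explicitly
    date: '2020-12-25'           # quoted, so pyyaml keeps it as a string
    description: example entry   # assumed optional field
""")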
def test_date_format_validation(self, schema_arg=None, schema_file_arg=None):
    '''Ensure that fancy date formats are correctly validated.'''
    # skip if the test is not being called from test_json_validation
    if schema_arg is None and schema_file_arg is None:
        self.assertTrue(True)
        return

    tests = [
        {
            'input': {'date': '20200606'},
            'file': 'invalid_date',
            'err_str': "'20200606' is not a 'date'",
        },
        {
            'input': {'date': 20200606},
            'file': 'invalid_date_type',
            'err_str': "20200606 is not of type 'string'",
        },
        {
            'input': {'name': 'valid_date', 'date': '2020-06-06', 'distance': 3},
            'file': 'valid_date',
            'output': {
                **schema_defaults,
                'name': 'valid_date',
                'date': '2020-06-06',
                'distance': 3,
            },
        },
    ]
    self.execute_tests(schema_arg, schema_file_arg, tests)

    # pyyaml-specific issue: dates get automatically parsed into datetime objects (doh!)
    file_path = os_path.join(json_validation_dir, 'unquoted_date.yaml')
    err_str = r"datetime\.date\(2020, 6, 6\) is not of type 'string'"
    with self.assertRaisesRegex(ValidationError, err_str):
        run_validator(
            schema=schema_arg,
            schema_file=schema_file_arg,
            data_file=file_path,
            validate_at=valid_json_loc)
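# A minimal, self-contained sketch of the pyyaml behaviour the unquoted_date
# fixture exercises: safe_load resolves bare ISO dates to datetime.date, so a
# schema expecting a string rejects them unless the value is quoted.
import datetime
import yaml

assert yaml.safe_load("date: 2020-06-06") == {'date': datetime.date(2020, 6, 6)}
assert yaml.safe_load("date: '2020-06-06'") == {'date': '2020-06-06'}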
def validate_data_source(path):
    """Validate the structure and syntax of a data source file."""
    print(f' validating {path}..')
    # JSON schema for data source files in /data_sources
    data_source_schema_file = _VALID_SCHEMA_TYPES['data_source']['file']
    data = run_validator(schema_file=data_source_schema_file, data_file=path)
    namecheck_schema(path, data)
    print(f'✓ {path} is valid.')
    return data
def validate_view(path):
    """Validate the structure and syntax of an ArangoDB view."""
    print(f' validating {path}..')
    # JSON schema for /views
    view_schema_file = _VALID_SCHEMA_TYPES['view']['file']
    data = run_validator(data_file=path, schema_file=view_schema_file)
    namecheck_schema(path, data)
    print(f'✓ {path} is valid.')
    return data
def validate_stored_query(path):
    """Validate the structure and syntax of a stored query."""
    print(f' validating {path}..')
    stored_queries_schema_file = _VALID_SCHEMA_TYPES['stored_query']['file']
    data = run_validator(schema_file=stored_queries_schema_file, data_file=path)
    namecheck_schema(path, data)
    # Make sure `params` can be used as a JSON schema
    if data.get('params'):
        # If the schema is invalid, a SchemaError will get raised.
        # Otherwise, the schema works and a ValidationError will get raised
        # (the empty data is not expected to validate, so we ignore it).
        try:
            run_validator(data={}, schema=data['params'])
        except ValidationError:
            pass
    # check that the query is valid AQL
    validate_aql_on_arango(data)
    print(f'✓ {path} is valid.')
    return data
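# A hedged sketch of the stored-query layout validate_stored_query works
# through: 'params' must itself be a usable JSON schema, and 'query' must be
# valid AQL (both checks appear in the function above; 'query' is also read by
# run_query below). The 'name' field and the concrete AQL are illustrative
# assumptions.
example_stored_query = {
    'name': 'example_query',    # assumed field; namecheck_schema ties names to file names
    'params': {                 # exercised via run_validator(data={}, schema=...)
        'type': 'object',
        'required': ['key'],
        'properties': {'key': {'type': 'string'}},
    },
    'query': 'FOR doc IN @@collection FILTER doc._key == @key RETURN doc',
}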
def test_complex_schema_references(self):
    """Test validation with complex references that reference other references."""
    valid_data = {
        'node': {
            'id': 'TAIR:19830',
            'type': 'gene',
        },
        'edge': valid_edge_data,
        'marks_out_of_ten': 5,
    }
    invalid_data = {
        'node': {
            'id': 'TAIR:19830',
            'type': 'gene',
        },
        'edge': invalid_edge_data,
        'marks_out_of_ten': 5,
    }
    err_msg = "'whatever' is not valid under any of the given schemas"
    for file_ext in ['json', 'yaml']:
        with self.subTest(file_ext=file_ext):
            file_path = os_path.join(
                *(test_data_dirs + ['schema_refs', 'level_1']),
                'test_object.' + file_ext)
            # invalid data fails validation
            with self.assertRaisesRegex(ValidationError, err_msg):
                run_validator(schema_file=file_path, data=invalid_data)
            # valid data passes and is returned unchanged
            self.assertEqual(
                run_validator(schema_file=file_path, data=valid_data),
                valid_data)
def validate_collection(path):
    """Validate the structure and syntax of a vertex or edge collection schema."""
    print(f' validating {path}..')
    # JSON schema for vertex and edge collection schemas found in /schema
    collection_schema_file = _VALID_SCHEMA_TYPES['collection']['file']
    data = run_validator(schema_file=collection_schema_file, data_file=path)
    namecheck_schema(path, data)
    # Make sure the embedded schema can be used as a JSON schema.
    # If the schema is invalid, a SchemaError will get raised;
    # otherwise, the schema works and a ValidationError will get raised (what we want).
    try:
        run_validator(data={}, schema=data['schema'])
    except ValidationError:
        pass
    except Exception:
        print('=' * 80)
        print('Unable to load schema in ' + path)
        raise
    required = data['schema'].get('required', [])
    # Edges must require _from and _to, while vertices must require _key.
    # Time-travel (delta) schemas use "from"/"to" and "id" instead.
    has_edge_fields = ('_from' in required and '_to' in required)
    has_delta_edge_fields = ('from' in required and 'to' in required)
    if data['type'] == 'edge' and data.get('delta') and not has_delta_edge_fields:
        raise ValidationError(
            'Time-travel edge schemas must require "from" and "to" attributes in ' + path)
    elif data['type'] == 'edge' and not data.get('delta') and not has_edge_fields:
        raise ValidationError(
            'Edge schemas must require "_from" and "_to" attributes in ' + path)
    elif data['type'] == 'vertex' and data.get('delta') and 'id' not in required:
        raise ValidationError(
            'Time-travel vertex schemas must require the "id" attribute in ' + path)
    elif data['type'] == 'vertex' and not data.get('delta') and '_key' not in required:
        raise ValidationError(
            'Vertex schemas must require the "_key" attribute in ' + path)
    print(f'✓ {path} is valid.')
    return data
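# A hedged sketch of a collection spec that would pass validate_collection: a
# non-time-travel edge collection whose embedded JSON schema requires '_from'
# and '_to'. The 'type', 'delta', 'schema', and 'required' keys come from the
# function above; 'name' and the property details are illustrative assumptions.
example_edge_collection = {
    'name': 'example_edge',   # assumed; namecheck_schema likely ties this to the file name
    'type': 'edge',
    'delta': False,           # not time-travel, so '_from'/'_to' must be required
    'schema': {
        'type': 'object',
        'required': ['_from', '_to'],
        'properties': {
            '_from': {'type': 'string'},
            '_to': {'type': 'string'},
        },
    },
}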
def execute_tests(self, schema_arg, schema_file_arg, tests, file_types=(None, 'json', 'yaml')):
    """Run each test case against in-memory data and/or the corresponding data file."""
    for t in tests:
        for file_ext in file_types:
            data = t['input']
            data_file = os_path.join(json_validation_dir, f"{t['file']}.{file_ext}")
            if file_ext is None:
                # validate the in-memory data structure instead of a file
                data_file = None
            else:
                data = None
            with self.subTest(input=t['input'], file_type=file_ext):
                if 'err_str' in t:
                    with self.assertRaisesRegex(ValidationError, t['err_str']):
                        run_validator(
                            schema=schema_arg,
                            schema_file=schema_file_arg,
                            data=data,
                            data_file=data_file,
                            validate_at=valid_json_loc)
                else:
                    output = run_validator(
                        schema=schema_arg,
                        schema_file=schema_file_arg,
                        data=data,
                        data_file=data_file,
                        validate_at=valid_json_loc)
                    self.assertEqual(output, {**schema_defaults, **t['output']})
def test_load_valid_manifests(self):
    """Test that valid manifest files pass validation."""
    valid_dir = os_path.join(_TEST_DIR, 'valid_manifest')
    file_list = ['with_descriptions', 'no_file_ext', 'no_file_format']
    for file in file_list:
        data_file = os_path.join(valid_dir, file + '.yaml')
        print('looking at ' + data_file)
        self.assertTrue(
            run_validator(schema_file=schema_file, data_file=data_file, nicer_errors=True))
def test_schema_references(self):
    """Ensure referenced schemas, including those written in yaml, can be accessed."""
    # the same schema appears at several directory depths
    path_list = [[], ['level_1'], ['level_1', 'level_2']]
    err_msg = "'whatever' is not valid under any of the given schemas"
    for path in path_list:
        for file_ext in ['json', 'yaml']:
            with self.subTest(file_ext=file_ext):
                file_path = os_path.join(
                    *(test_data_dirs + ['schema_refs'] + path),
                    'edge.' + file_ext)
                # fails due to invalid data
                with self.assertRaisesRegex(ValidationError, err_msg):
                    run_validator(schema_file=file_path, data=invalid_edge_data)
                # valid data
                self.assertEqual(
                    run_validator(schema_file=file_path, data=valid_edge_data),
                    valid_edge_data)
                # validate using the schema instead of the schema_file
                with open(file_path) as fd:
                    contents = yaml.safe_load(fd) if file_ext == 'yaml' else json.load(fd)
                # if there is no $id in the schema, the ref resolver won't know
                # where the schema file is located and will not resolve relative references
                with self.assertRaisesRegex(RefResolutionError, 'No such file or directory'):
                    run_validator(schema=contents, data=valid_edge_data)
                # inject an $id with the current file path
                contents['$id'] = file_path
                self.assertEqual(
                    run_validator(schema=contents, data=valid_edge_data),
                    valid_edge_data)
def run_query():
    """
    Run a stored query as a query against the database.
    Auth:
     - only kbase re admins for ad-hoc queries
     - public stored queries (these have access controls within them based on params)
    """
    json_body = parse_json.get_json_body() or {}
    # fetch number of documents to return
    batch_size = int(flask.request.args.get('batch_size', 10000))
    full_count = flask.request.args.get('full_count', False)
    if 'query' in json_body:
        # Run an ad-hoc query for a sysadmin
        auth.require_auth_token(roles=['RE_ADMIN'])
        query_text = _preprocess_stored_query(json_body['query'], json_body)
        del json_body['query']
        if 'ws_ids' in query_text:
            # Fetch any authorized workspace IDs using a KBase auth token, if present
            auth_token = auth.get_auth_header()
            json_body['ws_ids'] = auth.get_workspace_ids(auth_token)
        resp_body = arango_client.run_query(
            query_text=query_text,
            bind_vars=json_body,
            batch_size=batch_size,
            full_count=full_count)
        return flask.jsonify(resp_body)
    if 'stored_query' in flask.request.args or 'view' in flask.request.args:
        # Run a query from a query name.
        # Note: we are maintaining backwards compatibility here with the "view" arg;
        # "stored_query" is the more accurate name.
        query_name = flask.request.args.get('stored_query') or flask.request.args.get('view')
        stored_query = spec_loader.get_stored_query(query_name)
        if 'params' in stored_query:
            # Validate the user params for the query
            stored_query_path = spec_loader.get_stored_query(query_name, path_only=True)
            run_validator(schema_file=stored_query_path, data=json_body, validate_at='/params')
        stored_query_source = _preprocess_stored_query(stored_query['query'], stored_query)
        if 'ws_ids' in stored_query_source:
            # Fetch any authorized workspace IDs using a KBase auth token, if present
            auth_token = auth.get_auth_header()
            json_body['ws_ids'] = auth.get_workspace_ids(auth_token)
        resp_body = arango_client.run_query(
            query_text=stored_query_source,
            bind_vars=json_body,
            batch_size=batch_size,
            full_count=full_count)
        return flask.jsonify(resp_body)
    if 'cursor_id' in flask.request.args:
        # Continue fetching results from an existing cursor
        cursor_id = flask.request.args['cursor_id']
        resp_body = arango_client.run_query(cursor_id=cursor_id)
        return flask.jsonify(resp_body)
    # No valid options were passed
    raise InvalidParameters('Pass in a query name or a cursor_id')
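# A hedged usage sketch for the run_query handler above. The endpoint path is
# a placeholder assumption; the 'stored_query' and 'batch_size' query-string
# parameters and the JSON body of bind vars (validated at /params) are taken
# directly from the handler.
import requests

resp = requests.post(
    'http://localhost:5000/<run_query_route>',   # placeholder route, not the real path
    params={'stored_query': 'example_query', 'batch_size': 100},
    json={'key': 'some_value'},                  # bind vars for the stored query
    headers={'Authorization': '<kbase_token>'},  # needed when the query uses ws_ids
)
print(resp.json())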
def test_array_validation(self, schema_arg=None, schema_file_arg=None):
    """
    Check that array validation and default population work correctly when refs are used.

    The current implementation of the population of defaults does not allow
    defaults to be populated if the property is a reference, i.e.

        'properties': {
            'fruits': {'$ref': '...'}
        }
    """
    # skip if the test is not being called from test_json_validation
    if schema_arg is None and schema_file_arg is None:
        self.assertTrue(True)
        return

    # test the use of refs when populating defaults
    tests = [
        {
            'fruits': fruit_ref,
            'name': 'using fruit.yaml -- array item is a ref',
            'output': {'params': {'name': 'name', 'distance': 1, 'fruits': []}},
        },
        {
            # N.b. the default does not get populated in this case!
            # This is a change from the expected functionality
            'fruits': fruits_array_ref,
            'name': 'using fruits_array.yaml -- the array is a ref',
            'output': {'params': {'name': 'name', 'distance': 1}},
        },
        {
            'fruits': fruits_explicit,
            'name': 'with no references',
            'output': {'params': {'name': 'name', 'distance': 1, 'fruits': []}},
        },
    ]
    for t in tests:
        with self.subTest(desc=t['name']):
            test_schema['properties']['params']['properties']['fruits'] = t['fruits']
            output = run_validator(schema=test_schema, data={'params': {'name': 'name'}})
            self.assertEqual(output, t['output'])
    # restore the original value
    test_schema['properties']['params']['properties']['fruits'] = fruits_explicit
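# A hedged sketch of the three 'fruits' schema variants the test swaps in.
# The actual fixtures live in fruit.yaml / fruits_array.yaml; these are
# illustrative reconstructions based on the test's comments and outputs.
fruits_explicit_sketch = {       # default populates: the property is defined inline
    'type': 'array',
    'default': [],
    'items': {'type': 'string'},
}
fruit_ref_sketch = {             # default still populates: only the array *items* are a ref
    'type': 'array',
    'default': [],
    'items': {'$ref': 'fruit.yaml'},
}
fruits_array_ref_sketch = {      # default does NOT populate: the property itself is a ref
    '$ref': 'fruits_array.yaml',
}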
def test_non_validation_validator_errors(self):
    '''Test errors in the validator that are unrelated to the validation functionality.'''
    # no schema or schema_file supplied
    err_str = "Please supply either a schema or a schema file path"
    with self.assertRaisesRegex(ValueError, err_str):
        run_validator()
    with self.assertRaisesRegex(ValueError, err_str):
        run_validator(data={})
    # only supply one of schema or schema_file
    with self.assertRaisesRegex(ValueError, err_str):
        run_validator(schema={}, schema_file='/path/to/file')

    err_str = "Please supply either a data structure or a data file path"
    with self.assertRaisesRegex(ValueError, err_str):
        run_validator(schema={})
    with self.assertRaisesRegex(ValueError, err_str):
        run_validator(schema={}, data={}, data_file='')
    with self.assertRaisesRegex(ValueError, err_str):
        run_validator(schema={}, data=None, data_file=None)

    # invalid file type
    test_file = os_path.join(*(test_data_dirs + ['test_file.md']))
    err_msg = f'Unknown file type encountered: {test_file}'
    with self.assertRaisesRegex(TypeError, err_msg):
        run_validator(schema_file=test_file, data={})

    # invalid jsonpointer string -- note the grammatical error comes from jsonpointer itself
    err_str = 'location must starts with /'
    json_loc = 'start validating here'
    with self.assertRaisesRegex(JsonPointerException, err_str):
        run_validator(schema=test_schema, data={}, validate_at=json_loc)

    # invalid jsonpointer ref
    err_str = "member 'property' not found in"
    json_loc = '/properties/params/property'
    with self.assertRaisesRegex(JsonPointerException, err_str):
        run_validator(schema=test_schema, data={}, validate_at=json_loc)

    # finally, a call that succeeds!
    output = run_validator(
        schema=test_schema,
        data={'name': 'name', 'distance': 3},
        validate_at=valid_json_loc)
    self.assertEqual(output, {**schema_defaults, 'name': 'name', 'distance': 3})