def get_data_source(self, data_source_id):
    """Fetch a data source's metadata from the Limonero service.

    Raises:
        ValueError: when the returned metadata carries no URL.
    """
    # Limonero connection settings come from the workflow configuration.
    services = self.parameters['configuration']['juicer']['services']
    limonero_config = services['limonero']
    metadata = limonero_service.get_data_source_info(
        limonero_config['url'],
        str(limonero_config['auth_token']),
        str(data_source_id))
    if not metadata.get('url'):
        raise ValueError(
            gettext('Incorrect data source configuration (empty url)'))
    return metadata
def test_get_all_data_sources_success(mocked_get):
    """Listing with an empty id must hit the datasources collection URL."""
    data_source_id = 700
    expected = {
        'id': data_source_id,
        'name': 'Data source for testing',
        'url': 'hdfs://test.com/testing.csv'
    }
    mocked_get.side_effect = fake_req(200, json.dumps(expected))()

    resp = limonero_service.get_data_source_info(
        'http://limonero/', '00000', '')

    # Every field returned by the service must match the stubbed payload.
    for key, value in resp.items():
        assert value == expected[key]
    mocked_get.assert_called_with('http://limonero/datasources/',
                                  headers={'X-Auth-Token': '00000'})
def __init__(self, parameters, named_inputs, named_outputs):
    """Resolve the polygon data source URL (when used) before base init.

    Raises:
        ValueError: when the polygon data source has no URL configured.
    """
    if parameters.get('type') in ['polygon', 'geojson']:
        limonero_config = (parameters['configuration']['juicer']
                           ['services']['limonero'])
        metadata = limonero_service.get_data_source_info(
            limonero_config['url'],
            str(limonero_config['auth_token']),
            parameters.get('polygon'))
        if not metadata.get('url'):
            raise ValueError(
                _('Incorrect data source configuration (empty url or '
                  'not GEOJSON)'))
        parameters['polygon_url'] = metadata['url']
    VisualizationMethodOperation.__init__(self, parameters, named_inputs,
                                          named_outputs)
def _set_data_source_parameters(self, parameters):
    """Load data source metadata and derive the reader options from it.

    Metadata is fetched from Limonero unless already present in the
    per-workflow cache. Also derives parsing options (header flag, null
    values, separator, quote char, schema inference, read mode) from the
    task parameters and the metadata.

    Args:
        parameters: task parameter dict (must contain DATA_SOURCE_ID_PARAM).

    Raises:
        ValueError: when the data source metadata has no URL.
    """
    self.data_source_id = int(parameters[self.DATA_SOURCE_ID_PARAM])
    # Retrieve metadata from Limonero.
    limonero_config = self.parameters['configuration']['juicer'][
        'services']['limonero']
    url = limonero_config['url']
    token = str(limonero_config['auth_token'])

    # Is data source information cached?
    self.metadata = self.parameters.get('workflow', {}).get(
        'data_source_cache', {}).get(self.data_source_id)
    if self.metadata is None:
        self.metadata = limonero_service.get_data_source_info(
            url, token, self.data_source_id)
        # FIX: the original indexed 'workflow'/'data_source_cache' directly
        # and raised KeyError when either key was absent, even though the
        # cache read above tolerates their absence. setdefault makes the
        # write path consistent with the read path.
        self.parameters.setdefault('workflow', {}).setdefault(
            'data_source_cache', {})[self.data_source_id] = self.metadata

    if not self.metadata.get('url'):
        raise ValueError(
            _('Incorrect data source configuration (empty url)'))

    # Header flag: any value outside this falsy set means "has header".
    self.header = parameters.get(self.HEADER_PARAM, False) not in (
        '0', 0, 'false', False)
    self.null_values = [
        v.strip()
        for v in parameters.get(self.NULL_VALUES_PARAM, '').split(",")
        if v.strip()
    ]
    # Separator: task parameter wins, then metadata delimiter, then comma.
    self.sep = parameters.get(
        self.SEPARATOR_PARAM,
        self.metadata.get('attribute_delimiter', ',')) or ','
    if self.metadata['format'] == 'TEXT':
        # TEXT sources are read line by line; the sentinel prevents
        # splitting on a real delimiter.
        self.sep = '{new_line}'
    self.quote = parameters.get(self.QUOTE_PARAM,
                                self.metadata.get('text_delimiter'))
    if self.quote == '\'':
        # Escape a single-quote quote char for the underlying reader.
        self.quote = '\\\''
    if self.sep in self.SEPARATORS:
        # Translate symbolic separator names (e.g. tab) to literal chars.
        self.sep = self.SEPARATORS[self.sep]
    self.infer_schema = parameters.get(self.INFER_SCHEMA_PARAM,
                                       self.INFER_FROM_LIMONERO)
    self.mode = parameters.get(self.MODE_PARAM, 'FAILFAST')
def test_get_data_source_info_failure(mocked_get):
    """A non-200 response from Limonero must raise ValueError."""
    data_source_id = 700
    text = {
        'id': data_source_id,
        'name': 'Data source for testing',
        'url': 'hdfs://test.com/testing.csv'
    }
    mocked_get.side_effect = fake_req(201, json.dumps(text))()
    url = 'http://limonero/datasources'
    token = '00000'
    with pytest.raises(ValueError):
        limonero_service.get_data_source_info(url, token, data_source_id)
    # FIX: these checks were inside the pytest.raises block, after the call
    # that raises, so they never executed; the response-content loop was
    # also dead code (no response object exists on the failure path) and
    # referenced an unbound 'resp'. The call assertion now runs for real.
    mocked_get.assert_called_with(
        'http://limonero/datasources/{}'.format(data_source_id),
        headers={'X-Auth-Token': '00000'})
def perform_copy(config, vallum_ds_id, target_id, path):
    """Copy the files produced by a Vallum data source into LOCAL storage.

    Runs the data source's query against the Vallum server referenced by
    its storage URL, downloads every result file and writes it under the
    target storage root.

    Args:
        config: Juicer configuration dict (holds the Limonero service info).
        vallum_ds_id: id of the Limonero data source (VALLUM storage).
        target_id: id of the Limonero storage (must be LOCAL) to write into.
        path: sub-path appended to the target storage root directory.

    Returns:
        Number of files copied, or a ``{'status': 'ERROR', ...}`` dict when
        a storage has an unexpected type.

    Raises:
        ValueError: when the Vallum server answers with a non-200 status.
    """
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    services_config = config.get('juicer').get('services')
    limonero_config = services_config.get('limonero')
    limonero_url = limonero_config.get('url')
    token = str(limonero_config.get('auth_token'))

    vallum_ds = limonero_service.get_data_source_info(limonero_url, token,
                                                      vallum_ds_id)
    vallum_storage = vallum_ds.get('storage', {})
    if vallum_storage.get('type') != 'VALLUM':
        return {'status': 'ERROR', 'message': 'Storage is not VALLUM'}

    target_storage = limonero_service.get_storage_info(limonero_url, token,
                                                       target_id)
    if target_storage.get('type') != 'LOCAL':
        return {
            'status': 'ERROR',
            'message': 'Target storage must be of type LOCAL'
        }

    # Vallum endpoint, credentials and database are encoded in the
    # storage URL.
    parsed = urlparse(vallum_storage.get('url'))
    base_url = '{}://{}:{}'.format(parsed.scheme, parsed.hostname,
                                   parsed.port or 80)
    url = base_url + parsed.path
    qs = parse_qs(parsed.query)
    database = qs.get('db', 'samples')[0]
    username = parsed.username
    password = parsed.password
    query = vallum_ds['command']
    mode = 'MN'
    thread = 1
    params = {
        "username": username,
        "password": password,
        "database": database,
        "mode": mode,
        "query": query,
        "thread": thread,
    }
    req = requests.post(url, params, verify=False)
    if req.status_code != 200:
        # FIX: the original concatenated str + int ('HTTP Status ' +
        # req.status_code), which raises TypeError instead of the intended
        # ValueError with a readable message.
        raise ValueError('HTTP Status ' + str(req.status_code))

    total = 0
    parsed_local = urlparse(target_storage.get('url'))
    target_dir = parsed_local.path + path  # '/vallum' + str(vallum_ds_id)
    obj = json.loads(req.text)
    for result in obj.get('result'):
        files = result.get('files')
        if not files:
            continue
        uri_files = [
            base_url + urlparse(f.get('uri')).path for f in files
        ]
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        for vallum_file in uri_files:
            file_req = requests.get(vallum_file, params, verify=False)
            if file_req.status_code != 200:
                # FIX: same str + int concatenation bug as above.
                raise ValueError('HTTP Status ' + str(file_req.status_code))
            final_filename = target_dir + '/' + vallum_file.split('/')[-1]
            print(final_filename)
            total += 1
            with open(final_filename, 'wb') as fout:
                fout.write(file_req.content)
    return total
def _build_privacy_restrictions(self):
    """Build the privacy restriction map for the workflow's data sources.

    Only runs for the 'spark' platform. For every data-reader task, fetches
    the data source metadata from Limonero (or from the injected
    ``_query_data_sources`` callable when set — presumably a test hook;
    confirm with callers) and collects each attribute's privacy
    configuration. Attributes that share an ``attribute_privacy_group_id``
    are normalized to the most restrictive anonymization technique found in
    the group. Results are stored on ``self.workflow`` under
    ``'data_source_cache'`` and ``'privacy_restrictions'``.
    """
    if 'juicer' not in self.config or \
            'services' not in self.config['juicer']:
        return
    limonero_config = self.config['juicer']['services']['limonero']
    data_sources = []
    if self.workflow['platform']['slug'] != 'spark':
        return
    for t in self.workflow['tasks']:
        if t['operation'].get('slug') == 'data-reader':
            if self._query_data_sources:
                # Alternate source of metadata when the hook is provided.
                ds = next(self._query_data_sources())
            else:
                ds = limonero_service.get_data_source_info(
                    limonero_config['url'],
                    str(limonero_config['auth_token']),
                    t['forms']['data_source']['value'])
            data_sources.append(ds)

    privacy_info = {}
    # Groups attributes across data sources by privacy group id.
    attribute_group_set = collections.defaultdict(list)

    data_source_cache = {}
    for ds in data_sources:
        data_source_cache[ds['id']] = ds
        attrs = []
        privacy_info[ds['id']] = {'attributes': attrs}
        for attr in ds['attributes']:
            # 'or {}' guards against an explicit null attribute_privacy.
            privacy = attr.get('attribute_privacy', {}) or {}
            attribute_privacy_group_id = privacy.get(
                'attribute_privacy_group_id')
            privacy_config = {
                'id': attr['id'],
                'name': attr['name'],
                'type': attr['type'],
                'details': privacy.get('hierarchy'),
                'privacy_type': privacy.get('privacy_type'),
                'anonymization_technique': privacy.get(
                    'anonymization_technique'),
                'attribute_privacy_group_id': attribute_privacy_group_id
            }
            attrs.append(privacy_config)
            if attribute_privacy_group_id:
                attribute_group_set[attribute_privacy_group_id].append(
                    privacy_config)
            # print('#' * 40)
            # print(attr.get('name'), attr.get('type'))
            # print(privacy.get('privacy_type'),
            #       privacy.get('anonymization_technique'),
            #       privacy.get('attribute_privacy_group_id'))

    def sort_attr_privacy(a):
        # Rank key: numeric strength of the anonymization technique.
        # NOTE(review): 'anonymization_technique' may be present with a
        # None value (privacy.get above), and dict.get only defaults on a
        # *missing* key — confirm ANONYMIZATION_TECHNIQUES handles None.
        return privaaas.ANONYMIZATION_TECHNIQUES[a.get(
            'anonymization_technique', 'NO_TECHNIQUE')]

    for attributes in list(attribute_group_set.values()):
        # The most restrictive technique in the group wins.
        more_restrictive = sorted(attributes, key=sort_attr_privacy,
                                  reverse=True)[0]
        # print(json.dumps(more_restrictive[0], indent=4))
        # Copy all privacy config from more restrictive one
        for attribute in attributes:
            attribute.update(more_restrictive)

    self.workflow['data_source_cache'] = data_source_cache
    self.workflow['privacy_restrictions'] = privacy_info