def _handle_esri_errors(self, response, error_message):
    """Check an ESRI REST response for errors and return its JSON payload.

    A reply can carry HTTP 200 and still contain an ``error`` object in its
    JSON body, so both the status code and the payload are inspected.

    Args:
        response: HTTP response exposing ``status_code``, ``text``,
            ``json()`` and ``request.url``.
        error_message: Context that is included in any raised error text.

    Returns:
        The decoded JSON payload (expected to be a JSON object).

    Raises:
        EsriDownloadError: On a non-200 status, or when the payload has a
            truthy ``error`` entry.
        Exception: Any failure of ``response.json()`` is logged and then
            re-raised unchanged.
    """
    if response.status_code != 200:
        raise EsriDownloadError('{}: {} HTTP {} {}'.format(
            response.request.url,
            error_message,
            response.status_code,
            response.text,
        ))

    try:
        data = response.json()
    except Exception:
        self._logger.error(
            "Could not parse response from {} as JSON:\n\n{}".format(
                response.request.url,
                response.text,
            ))
        raise

    error = data.get('error')
    if error:
        # Tolerate an error object that lacks 'message' or 'details' (or has
        # a null / bare-string 'details'), so the server's error is reported
        # instead of being masked by a KeyError or TypeError raised here.
        if isinstance(error, dict):
            message = error.get('message', '')
            details = error.get('details') or []
        else:
            message, details = error, []
        if isinstance(details, str):
            details = [details]

        raise EsriDownloadError("{}: {} {}".format(
            error_message,
            message,
            ', '.join(str(detail) for detail in details),
        ))

    return data
def test_download_handles_no_count(self):
    """ESRI caching copes with a server lacking returnCountOnly support.

    ``get_feature_count`` is patched to fail and the patched metadata has no
    fields, so the download is expected to fail because no object ID field
    can be found.
    """
    task = EsriRestDownloadTask('us-fl-palmbeach')
    source_config = SourceConfig(
        {
            "schema": 2,
            "layers": {
                "addresses": [{
                    "name": "default",
                    "conform": {
                        "number": "num",
                        "street": "str",
                    },
                }],
            },
        },
        "addresses",
        "default",
    )

    with patch('esridump.EsriDumper.get_metadata') as metadata_patch:
        metadata_patch.return_value = {'fields': []}
        with patch('esridump.EsriDumper.get_feature_count') as feature_patch:
            feature_patch.side_effect = EsriDownloadError(
                "Server doesn't support returnCountOnly")
            with self.assertRaises(EsriDownloadError) as e:
                task.download(['http://example.com/'], self.workdir,
                              source_config)

    # This is the expected exception at this point. The check has to run
    # after the assertRaises block (code following the raising call inside
    # it never executes), and the caught exception is exposed as
    # ``e.exception`` -- the context manager itself has no ``message``.
    self.assertEqual(
        str(e.exception),
        "Could not find object ID field name for deduplication",
    )
def test_download_handles_no_count(self):
    """ESRI caching copes with a server lacking returnCountOnly support.

    ``get_feature_count`` is patched to fail and the patched metadata has no
    fields, so the download is expected to fail because no object ID field
    can be found.
    """
    task = EsriRestDownloadTask('us-fl-palmbeach')

    with patch('esridump.EsriDumper.get_metadata') as metadata_patch:
        metadata_patch.return_value = {'fields': []}
        with patch('esridump.EsriDumper.get_feature_count') as feature_patch:
            feature_patch.side_effect = EsriDownloadError(
                "Server doesn't support returnCountOnly")
            with self.assertRaises(EsriDownloadError) as e:
                task.download(['http://example.com/'], self.workdir)

    # This is the expected exception at this point. The check has to run
    # after the assertRaises block (code following the raising call inside
    # it never executes), and the caught exception is exposed as
    # ``e.exception`` -- the context manager itself has no ``message``.
    self.assertEqual(
        str(e.exception),
        "Could not find object ID field name for deduplication",
    )
def _get_layer_oids(self):
    """Ask the layer's ``/query`` endpoint for its object IDs.

    Returns:
        The ``objectIds`` value from the server's JSON reply.

    Raises:
        EsriDownloadError: When the reply has no (or an empty) ``objectIds``
            entry; ``_handle_esri_errors`` may also raise for the response.
    """
    # '1=1' is a where clause that matches every row, so we get everything.
    id_query = dict(where='1=1', returnIdsOnly='true', f='json')
    params = self._build_query_args(id_query)

    endpoint = self._build_url('/query')
    request_headers = self._build_headers()
    reply = self._request(
        'GET', endpoint, params=params, headers=request_headers)
    payload = self._handle_esri_errors(reply, "Could not retrieve object IDs")

    object_ids = payload.get('objectIds')
    if object_ids:
        return object_ids
    raise EsriDownloadError("Server doesn't support returnIdsOnly")
def get_feature_count(self):
    """Return the number of features the layer reports.

    Issues a ``returnCountOnly`` query against the layer's ``/query``
    endpoint.

    Returns:
        The ``count`` value from the server's JSON reply. ``0`` is returned
        as-is for a layer that reports no features.

    Raises:
        EsriDownloadError: When the reply carries no ``count`` entry at all;
            ``_handle_esri_errors`` may also raise for the response.
    """
    query_args = self._build_query_args({
        'where': '1=1',
        'returnCountOnly': 'true',
        'f': 'json',
    })
    headers = self._build_headers()
    url = self._build_url('/query')
    response = self._request('GET', url, params=query_args, headers=headers)
    count_json = self._handle_esri_errors(response, "Could not retrieve row count")

    count = count_json.get('count')
    # Only a missing count means the server ignored returnCountOnly; a count
    # of 0 is a legitimate answer and must not be reported as unsupported.
    if count is None:
        raise EsriDownloadError("Server doesn't support returnCountOnly")

    return count
def _get_layer_min_max(self, oid_field_name):
    """ Find the min and max values for the OID field.

    Runs an ``outStatistics`` query for the minimum and maximum of
    ``oid_field_name`` and then verifies, with a ``returnIdsOnly`` query,
    that both values really are object IDs of the layer.

    Args:
        oid_field_name: Name of the layer's object ID field.

    Returns:
        A ``(min_value, max_value)`` tuple.

    Raises:
        EsriDownloadError: When the statistics reply holds no usable values
            or the verification query does not return both values. Failing
            with this type (rather than IndexError/TypeError) lets callers
            that catch it fall back to another paging strategy.
    """
    query_args = self._build_query_args({
        'f': 'json',
        'outFields': '',
        'outStatistics': json.dumps([
            dict(statisticType='min', onStatisticField=oid_field_name, outStatisticFieldName='THE_MIN'),
            dict(statisticType='max', onStatisticField=oid_field_name, outStatisticFieldName='THE_MAX'),
        ], separators=(',', ':'))
    })
    headers = self._build_headers()
    url = self._build_url('/query')
    response = self._request('GET', url, params=query_args, headers=headers)
    metadata = self._handle_esri_errors(response, "Could not retrieve min/max oid values")

    # Some servers (specifically version 10.11, it seems) will respond with SQL statements
    # for the attribute names rather than the requested field names, so pick the min and max
    # deliberately rather than relying on the names.
    features = metadata.get('features') or []
    attributes = (features[0].get('attributes') or {}) if features else {}
    min_max_values = list(attributes.values())

    # No feature row, or null statistics: there is nothing to page over.
    if not min_max_values or any(value is None for value in min_max_values):
        raise EsriDownloadError('Server returned invalid min/max')

    min_value = min(min_max_values)
    max_value = max(min_max_values)

    # Cross-check: both extremes must come back as real object IDs.
    query_args = self._build_query_args({
        'where': '{} = {} OR {} = {}'.format(
            oid_field_name,
            min_value,
            oid_field_name,
            max_value
        ),
        'returnIdsOnly': 'true',
        'f': 'json',
    })
    headers = self._build_headers()
    url = self._build_url('/query')
    response = self._request('GET', url, params=query_args, headers=headers)
    oid_data = self._handle_esri_errors(response, "Could not check min/max values")

    object_ids = oid_data.get('objectIds') if oid_data else None
    if not object_ids or min_value not in object_ids or max_value not in object_ids:
        raise EsriDownloadError('Server returned invalid min/max')

    return (min_value, max_value)
def _get_layer_oids(self):
    """Retrieve the layer's object IDs with a ``returnIdsOnly`` query.

    Returns:
        The ``objectIds`` value found in the server's JSON reply.

    Raises:
        EsriDownloadError: If ``objectIds`` is missing or empty in the reply;
            ``_handle_esri_errors`` may also raise for the response.
    """
    everything = {}
    everything["where"] = "1=1"  # matches all rows, so we get everything
    everything["returnIdsOnly"] = "true"
    everything["f"] = "json"
    request_params = self._build_query_args(everything)

    target = self._build_url("/query")
    extra_headers = self._build_headers()
    raw = self._request(
        "GET",
        target,
        params=request_params,
        headers=extra_headers,
    )
    decoded = self._handle_esri_errors(raw, "Could not retrieve object IDs")

    found = decoded.get("objectIds")
    if found:
        return found
    raise EsriDownloadError("Server doesn't support returnIdsOnly")
def get_feature_count(self):
    """Return the number of features the layer reports.

    Issues a ``returnCountOnly`` query against the layer's ``/query``
    endpoint.

    Returns:
        The ``count`` value from the server's JSON reply. ``0`` is returned
        as-is for a layer that reports no features.

    Raises:
        EsriDownloadError: When the reply carries no ``count`` entry at all;
            ``_handle_esri_errors`` may also raise for the response.
    """
    query_args = self._build_query_args({
        "where": "1=1",
        "returnCountOnly": "true",
        "f": "json",
    })
    headers = self._build_headers()
    url = self._build_url("/query")
    response = self._request("GET", url, params=query_args, headers=headers)
    count_json = self._handle_esri_errors(response, "Could not retrieve row count")

    count = count_json.get("count")
    # Only a missing count means the server ignored returnCountOnly; a count
    # of 0 is a legitimate answer and must not be reported as unsupported.
    if count is None:
        raise EsriDownloadError("Server doesn't support returnCountOnly")

    return count
def __iter__(self):
    """Yield every feature of the layer, converted with ``esri2geojson``.

    The requests are planned with the first strategy that works:

    1. ``resultOffset`` / ``resultRecordCount`` paging, when a row count is
       available and the metadata advertises pagination support.
    2. Object ID ranges derived from min/max statistics, when the metadata
       advertises ``supportsStatistics``.
    3. Object ID ranges derived from the full list of object IDs.
    4. Envelope-based (geospatial) queries, de-duplicated by object ID.

    Raises:
        EsriDownloadError: When no object ID field can be found for the
            non-pagination strategies, or when fetching a page fails.
    """
    query_fields = self._fields
    metadata = self.get_metadata()
    page_size = min(1000, metadata.get('maxRecordCount', 500))

    row_count = None

    try:
        row_count = self.get_feature_count()
    except EsriDownloadError:
        self._logger.info("Source does not support feature count")

    page_args = []

    # 'advancedQueryCapabilities' may be present without a
    # 'supportsPagination' entry, so look it up without indexing.
    advanced_capabilities = metadata.get('advancedQueryCapabilities') or {}
    supports_pagination = (
        metadata.get('supportsPagination')
        or advanced_capabilities.get('supportsPagination')
    )

    if row_count is not None and supports_pagination:
        # If the layer supports pagination, we can use resultOffset/resultRecordCount to paginate

        # There's a bug where some servers won't handle these queries in combination with a list of
        # fields specified. We'll make a single, 1 row query here to check if the server supports this
        # and switch to querying for all fields if specifying the fields fails.
        if query_fields and not self.can_handle_pagination(query_fields):
            self._logger.info(
                "Source does not support pagination with fields specified, so querying for all fields."
            )
            query_fields = None

        for offset in range(self._startWith, row_count, page_size):
            query_args = self._build_query_args({
                'resultOffset': offset,
                'resultRecordCount': page_size,
                'where': '1=1',
                'geometryPrecision': self._precision,
                'returnGeometry': self._request_geometry,
                'outSR': self._outSR,
                'outFields': ','.join(query_fields or ['*']),
                'f': 'json',
            })
            page_args.append(query_args)
        self._logger.info("Built %s requests using resultOffset method", len(page_args))
    else:
        # If not, we can still use the `where` argument to paginate

        use_oids = True
        oid_field_name = self._find_oid_field_name(metadata)

        if not oid_field_name:
            raise EsriDownloadError(
                "Could not find object ID field name for deduplication")

        if metadata.get('supportsStatistics'):
            # If the layer supports statistics, we can request maximum and minimum object ID
            # to help build the pages
            try:
                (oid_min, oid_max) = self._get_layer_min_max(oid_field_name)

                # Ranges are (page_min, page_max], hence starting one below the minimum.
                for page_min in range(oid_min - 1, oid_max, page_size):
                    page_max = min(page_min + page_size, oid_max)
                    query_args = self._build_query_args({
                        'where': '{} > {} AND {} <= {}'.format(
                            oid_field_name,
                            page_min,
                            oid_field_name,
                            page_max,
                        ),
                        'geometryPrecision': self._precision,
                        'returnGeometry': self._request_geometry,
                        'outSR': self._outSR,
                        'outFields': ','.join(query_fields or ['*']),
                        'f': 'json',
                    })
                    page_args.append(query_args)
                self._logger.info(
                    "Built {} requests using OID where clause method".
                    format(len(page_args)))

                # If we reach this point we don't need to fall through to enumerating all object IDs
                # because the statistics method worked
                use_oids = False
            except EsriDownloadError:
                self._logger.exception(
                    "Finding max/min from statistics failed. Trying OID enumeration."
                )

        if use_oids:
            # If the layer does not support statistics, we can request
            # all the individual IDs and page through them one chunk at
            # a time.
            try:
                oids = sorted(map(int, self._get_layer_oids()))

                for i in range(0, len(oids), page_size):
                    oid_chunk = oids[i:i + page_size]
                    page_min = oid_chunk[0]
                    page_max = oid_chunk[-1]
                    query_args = self._build_query_args({
                        'where': '{} >= {} AND {} <= {}'.format(
                            oid_field_name,
                            page_min,
                            oid_field_name,
                            page_max,
                        ),
                        'geometryPrecision': self._precision,
                        'returnGeometry': self._request_geometry,
                        'outSR': self._outSR,
                        'outFields': ','.join(query_fields or ['*']),
                        'f': 'json',
                    })
                    page_args.append(query_args)
                self._logger.info(
                    "Built %s requests using OID enumeration method",
                    len(page_args))
            except EsriDownloadError:
                self._logger.info("Falling back to geo queries")
                # Use geospatial queries when none of the ID-based methods will work
                bounds = metadata['extent']
                saved = set()

                for feature in self._scrape_an_envelope(
                        bounds, self._outSR, page_size):
                    attrs = feature['attributes']
                    oid = attrs.get(oid_field_name)
                    # The same feature can be returned more than once; emit it only once.
                    if oid in saved:
                        continue

                    yield esri2geojson(feature)

                    saved.add(oid)

                return

    query_url = self._build_url('/query')
    headers = self._build_headers()
    for query_args in page_args:
        try:
            response = self._request('POST', query_url, headers=headers, data=query_args)
            data = self._handle_esri_errors(
                response, "Could not retrieve this chunk of objects")
        except EsriDownloadError:
            # Already a meaningful server-side error; do not let the generic
            # handler below relabel it as a connection problem.
            raise
        except socket.timeout as e:
            raise EsriDownloadError("Timeout when connecting to URL", e)
        except ValueError as e:
            raise EsriDownloadError("Could not parse JSON", e)
        except Exception as e:
            raise EsriDownloadError("Could not connect to URL", e)

        # Defensive re-check of the payload for a reported error.
        error = data.get('error')
        if error:
            raise EsriDownloadError(
                "Problem querying ESRI dataset with args {}. Server said: {}"
                .format(query_args, error['message']))

        # A reply without a 'features' entry simply contributes no features.
        features = data.get('features') or []

        for feature in features:
            yield esri2geojson(feature)
def _get_layer_min_max(self, oid_field_name):
    """ Find the min and max values for the OID field.

    Runs an ``outStatistics`` query for the minimum and maximum of
    ``oid_field_name`` and then verifies, with a ``returnIdsOnly`` query,
    that both values really are object IDs of the layer.

    Args:
        oid_field_name: Name of the layer's object ID field.

    Returns:
        A ``(min_value, max_value)`` tuple.

    Raises:
        EsriDownloadError: When the statistics reply holds no usable values
            or the verification query does not return both values. Failing
            with this type (rather than IndexError/TypeError) lets callers
            that catch it fall back to another paging strategy.
    """
    query_args = self._build_query_args({
        "f": "json",
        "outFields": "",
        "outStatistics": json.dumps(
            [
                dict(
                    statisticType="min",
                    onStatisticField=oid_field_name,
                    outStatisticFieldName="THE_MIN",
                ),
                dict(
                    statisticType="max",
                    onStatisticField=oid_field_name,
                    outStatisticFieldName="THE_MAX",
                ),
            ],
            separators=(",", ":"),
        ),
    })
    headers = self._build_headers()
    url = self._build_url("/query")
    response = self._request("GET", url, params=query_args, headers=headers)
    metadata = self._handle_esri_errors(
        response, "Could not retrieve min/max oid values")

    # Some servers (specifically version 10.11, it seems) will respond with SQL statements
    # for the attribute names rather than the requested field names, so pick the min and max
    # deliberately rather than relying on the names.
    features = metadata.get("features") or []
    attributes = (features[0].get("attributes") or {}) if features else {}
    min_max_values = list(attributes.values())

    # No feature row, or null statistics: there is nothing to page over.
    if not min_max_values or any(value is None for value in min_max_values):
        raise EsriDownloadError("Server returned invalid min/max")

    min_value = min(min_max_values)
    max_value = max(min_max_values)

    # Cross-check: both extremes must come back as real object IDs.
    query_args = self._build_query_args({
        "where": "{} = {} OR {} = {}".format(oid_field_name, min_value,
                                             oid_field_name, max_value),
        "returnIdsOnly": "true",
        "f": "json",
    })
    headers = self._build_headers()
    url = self._build_url("/query")
    response = self._request("GET", url, params=query_args, headers=headers)
    oid_data = self._handle_esri_errors(response,
                                        "Could not check min/max values")

    object_ids = oid_data.get("objectIds") if oid_data else None
    if (not object_ids or min_value not in object_ids
            or max_value not in object_ids):
        raise EsriDownloadError("Server returned invalid min/max")

    return (min_value, max_value)