def _read_from_hdx(self, object_type, value, fieldname='id', action=None, **kwargs):
    # type: (str, str, str, Optional[str], Any) -> Tuple[bool, Union[Dict, str]]
    """Makes a read call to HDX passing in given parameter.

    Args:
        object_type (str): Description of HDX object type (for messages)
        value (str): Value of HDX field
        fieldname (str): HDX field name. Defaults to id.
        action (Optional[str]): Replacement CKAN action url to use. Defaults to None.
        **kwargs: Other fields to pass to CKAN.

    Returns:
        Tuple[bool, Union[Dict, str]]: (True/False, HDX object metadata/Error)
    """
    if not fieldname:
        raise HDXError('Empty %s field name!' % object_type)
    # Default to the object type's 'show' action unless the caller overrides it
    ckan_action = self.actions()['show'] if action is None else action
    # kwargs deliberately take precedence over the fieldname/value pair
    request_data = dict({fieldname: value}, **kwargs)
    try:
        return True, self.configuration.call_remoteckan(ckan_action, request_data)
    except NotFound:
        # A missing object is an expected outcome, not an error
        return False, '%s=%s: not found!' % (fieldname, value)
    except Exception as e:
        raisefrom(HDXError, 'Failed when trying to read: %s=%s! (POST)' % (fieldname, value), e)
def stream_file(self, url, folder=None):
    # type: (str, Optional[str]) -> str
    """Stream file from url and store in provided folder or temporary folder if no folder supplied.
    Must call setup_streaming_download method first.

    Args:
        url (str): URL to download
        folder (Optional[str]): Folder to download it to. Defaults to None (temporary folder).

    Returns:
        str: Path of downloaded file
    """
    path = self.get_path_for_url(url, folder)
    try:
        # Use a context manager instead of manual open/close in a finally block:
        # the file is guaranteed closed on success and on error alike.
        with open(path, 'wb') as f:
            for chunk in self.response.iter_content(chunk_size=10240):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
                    # flush per chunk so partial data is on disk if the stream dies
                    f.flush()
            return f.name
    except Exception as e:
        raisefrom(DownloadError, 'Download of %s failed in retrieval of stream!' % url, e)
def _write_to_hdx(self, action, data, id_field_name=None, file_to_upload=None):
    # type: (str, Dict, str, Optional[str]) -> Dict
    """Creates or updates an HDX object in HDX and return HDX object metadata dict

    Args:
        action (str): Action to perform eg. 'create', 'update'
        data (Dict): Data to write to HDX
        id_field_name (Optional[str]): Name of field containing HDX object identifier. Defaults to None.
        file_to_upload (Optional[str]): File to upload to HDX. Defaults to None.

    Returns:
        Dict: HDX object metadata
    """
    upload_file = None
    try:
        files = None
        if file_to_upload:
            upload_file = open(file_to_upload, 'rb')
            files = [('upload', upload_file)]
        return self.configuration.call_remoteckan(self.actions()[action], data, files=files)
    except Exception as e:
        # Include the object identifier in the message when we know its field name
        idstr = ' %s' % data[id_field_name] if id_field_name else ''
        raisefrom(HDXError, 'Failed when trying to %s%s! (POST)' % (action, idstr), e)
    finally:
        if file_to_upload and upload_file:
            upload_file.close()
def get_tabular_stream(self, url, **kwargs):
    # type: (str, Any) -> tabulator.Stream
    """Get Tabulator stream.

    Args:
        url (str): URL to download
        **kwargs:
            headers (Union[int, List[int], List[str]]): Number of row(s) containing headers or list of headers
            file_type (Optional[str]): Type of file. Defaults to inferring.
            delimiter (Optional[str]): Delimiter used for values in each row. Defaults to inferring.

    Returns:
        tabulator.Stream: Tabulator Stream object
    """
    self.close_response()
    # tabulator expects 'format', so translate our 'file_type' kwarg when set
    if kwargs.get('file_type') is not None:
        kwargs['format'] = kwargs.pop('file_type')
    try:
        self.response = tabulator.Stream(url, **kwargs)
        self.response.open()
    except TabulatorException as e:
        raisefrom(DownloadError, 'Getting tabular stream for %s failed!' % url, e)
    return self.response
def setup_stream(self, url, timeout=None):
    # type: (str, Optional[float]) -> None
    """Setup streaming download from provided url

    Args:
        url (str): URL to download
        timeout (Optional[float]): Timeout for connecting to URL. Defaults to None (no timeout).

    Returns:
        None
    """
    self.response = None
    try:
        self.response = self.session.get(url, stream=True, timeout=timeout)
        self.response.raise_for_status()
    except Exception as e:
        # Bug fix: the original passed the bare format string, so the raised
        # message contained a literal '%s'; interpolate the url here.
        raisefrom(DownloadError, 'Setup of Streaming Download of %s failed!' % url, e)
def validlocations(self):
    # type: () -> List[Dict]
    """Return valid locations

    Returns:
        List[Dict]: Valid locations
    """
    try:
        locations = self._validlocationsfn()
    except (AttributeError, TypeError) as e:
        # Raised when no locations function was registered on this configuration
        raisefrom(
            ConfigurationError,
            'There is no valid locations function set up! Use Configuration.create(**kwargs)', e)
    else:
        return locations
def download(self, url, timeout=None):
    # type: (str, Optional[float]) -> requests.Response
    """Download url

    Args:
        url (str): URL to download
        timeout (Optional[float]): Timeout for connecting to URL. Defaults to None (no timeout).

    Returns:
        requests.Response: Response
    """
    try:
        # Store the response on self first so callers can inspect it even if
        # raise_for_status trips below
        self.response = self.session.get(url, timeout=timeout)
        self.response.raise_for_status()
    except Exception as e:
        raisefrom(DownloadError, 'Download of %s failed!' % url, e)
    return self.response
def hash_stream(self, url):
    # type: (str) -> str
    """Stream file from url and hash it using MD5. Must call setup_streaming_download method first.

    Args:
        url (str): URL to download

    Returns:
        str: MD5 hash of file
    """
    digester = hashlib.md5()
    try:
        for block in self.response.iter_content(chunk_size=10240):
            if not block:  # skip keep-alive new chunks
                continue
            digester.update(block)
        return digester.hexdigest()
    except Exception as e:
        raisefrom(DownloadError, 'Download of %s failed in retrieval of stream!' % url, e)
def setup(self, url, stream=True, post=False, parameters=None, timeout=None):
    # type: (str, bool, bool, Optional[Dict], Optional[float]) -> requests.Response
    """Setup download from provided url returning the response

    Args:
        url (str): URL to download
        stream (bool): Whether to stream download. Defaults to True.
        post (bool): Whether to use POST instead of GET. Defaults to False.
        parameters (Optional[Dict]): Parameters to pass. Defaults to None.
        timeout (Optional[float]): Timeout for connecting to URL. Defaults to None (no timeout).

    Returns:
        requests.Response: requests.Response object
    """
    self.close_response()
    self.response = None
    try:
        if post:
            # POST: parameters go in the request body
            full_url, parameters = self.get_url_params_for_post(url, parameters)
            self.response = self.session.post(full_url, data=parameters, stream=stream,
                                              timeout=timeout)
        else:
            # GET: parameters are encoded into the url
            self.response = self.session.get(self.get_url_for_get(url, parameters),
                                             stream=stream, timeout=timeout)
        self.response.raise_for_status()
    except Exception as e:
        # Bug fix: the original passed the bare format string, so the raised
        # message contained a literal '%s'; interpolate the url here.
        raisefrom(DownloadError, 'Setup of Streaming Download of %s failed!' % url, e)
    return self.response
def _parse_date(dataset_date, date_format): # type: (str, Optional[str]) -> datetime """Parse dataset date from string using specified format. If no format is supplied, the function will guess. For unambiguous formats, this should be fine. Args: dataset_date (str): Dataset date string date_format (Optional[str]): Date format. If None is given, will attempt to guess. Defaults to None. Returns: datetime.datetime """ if date_format is None: try: return parser.parse(dataset_date) except (ValueError, OverflowError) as e: raisefrom(HDXError, 'Invalid dataset date!', e) else: try: return datetime.strptime(dataset_date, date_format) except ValueError as e: raisefrom(HDXError, 'Invalid dataset date!', e)
def create_datastore(self, schema=None, primary_key=None, delete_first=0, path=None):
    # type: (Optional[List[Dict]], Optional[str], int, Optional[str]) -> None
    """For csvs, create a resource in the HDX datastore which enables data preview in HDX. If no schema is provided
    all fields are assumed to be text. If path is not supplied, the file is first downloaded from HDX.

    Args:
        schema (List[Dict]): List of fields and types of form {'id': 'FIELD', 'type': 'TYPE'}. Defaults to None.
        primary_key (Optional[str]): Primary key of schema. Defaults to None.
        delete_first (int): Delete datastore before creation. 0 = No, 1 = Yes, 2 = If no primary key. Defaults to 0.
        path (Optional[str]): Local path to file that was uploaded. Defaults to None.

    Returns:
        None
    """
    # Optionally clear out an existing datastore before (re)creating it
    if delete_first == 0:
        pass
    elif delete_first == 1:
        self.delete_datastore()
    elif delete_first == 2:
        if primary_key is None:
            self.delete_datastore()
    else:
        raise HDXError('delete_first must be 0, 1 or 2! (0 = No, 1 = Yes, 2 = Delete if no primary key)')
    if path is None:
        # Download the resource
        url, path = self.download()
        delete_after_download = True
    else:
        # Caller supplied a local file; url is only needed for log/error messages
        url = self.data.get('url', None)
        if not url:
            raise HDXError('No URL to download!')
        delete_after_download = False
    zip_path = None
    stream = None
    try:
        # If the file is a zip, extract its first member to a temp dir and
        # stream that instead; remember the zip path for cleanup below
        extension = splitext(path)[1]
        if extension.lower() == '.zip':
            zip_file = zipfile.ZipFile(path)
            filename = zip_file.namelist()[0]
            tempdir = gettempdir()
            zip_file.extract(filename, tempdir)
            zip_path = path
            path = join(tempdir, filename)

        def convert_to_text(extended_rows):
            # Coerce every cell to str so a uniform 'text' schema is valid
            for number, headers, row in extended_rows:
                for i, val in enumerate(row):
                    row[i] = str(val)
                yield (number, headers, row)

        stream = Stream(path, headers=1, post_parse=[convert_to_text], bytes_sample_size=1000000)
        stream.open()
        nonefieldname = False
        if schema is None:
            # No schema supplied: derive one from the header row, all text.
            # A None header means a trailing unnamed column - flag it so its
            # values can be stripped from each uploaded row below.
            schema = list()
            for fieldname in stream.headers:
                if fieldname is not None:
                    schema.append({'id': fieldname, 'type': 'text'})
                else:
                    nonefieldname = True
        data = {'resource_id': self.data['id'], 'force': True, 'fields': schema, 'primary_key': primary_key}
        self._write_to_hdx('datastore_create', data, 'resource_id')
        # upsert requires a primary key; plain insert otherwise
        if primary_key is None:
            method = 'insert'
        else:
            method = 'upsert'
        logger.debug('Uploading data from %s to datastore' % url)
        offset = 0
        chunksize = 100
        # Upload rows in chunks of `chunksize` until the stream is exhausted
        rowset = stream.read(keyed=True, limit=chunksize)
        while len(rowset) != 0:
            if nonefieldname:
                # Drop values from the unnamed column before uploading
                for row in rowset:
                    del row[None]
            data = {'resource_id': self.data['id'], 'force': True, 'method': method, 'records': rowset}
            self._write_to_hdx('datastore_upsert', data, 'resource_id')
            rowset = stream.read(keyed=True, limit=chunksize)
            logger.debug('Uploading: %s' % offset)
            offset += chunksize
    except Exception as e:
        raisefrom(HDXError, 'Upload to datastore of %s failed!' % url, e)
    finally:
        if stream:
            stream.close()
        if delete_after_download:
            # We downloaded the file ourselves: remove it (and the zip if any)
            unlink(path)
            if zip_path:
                unlink(zip_path)
        else:
            if zip_path:
                unlink(path)  # ie. we keep the zip but remove the extracted file
def create_datastore(self, schema=None, primary_key=None, delete_first=0, path=None):
    # type: (Optional[List[Dict]], Optional[str], int, Optional[str]) -> None
    """For tabular data, create a resource in the HDX datastore which enables data preview in HDX. If no schema is provided
    all fields are assumed to be text. If path is not supplied, the file is first downloaded from HDX.

    Args:
        schema (List[Dict]): List of fields and types of form {'id': 'FIELD', 'type': 'TYPE'}. Defaults to None.
        primary_key (Optional[str]): Primary key of schema. Defaults to None.
        delete_first (int): Delete datastore before creation. 0 = No, 1 = Yes, 2 = If no primary key. Defaults to 0.
        path (Optional[str]): Local path to file that was uploaded. Defaults to None.

    Returns:
        None
    """
    # Optionally clear out an existing datastore before (re)creating it
    if delete_first == 0:
        pass
    elif delete_first == 1:
        self.delete_datastore()
    elif delete_first == 2:
        if primary_key is None:
            self.delete_datastore()
    else:
        raise HDXError('delete_first must be 0, 1 or 2! (0 = No, 1 = Yes, 2 = Delete if no primary key)')
    if path is None:
        # Download the resource
        url, path = self.download()
        delete_after_download = True
    else:
        # Caller supplied a local file; reuse its path in log/error messages
        url = path
        delete_after_download = False

    def convert_to_text(extended_rows):
        # Coerce every cell to str so a uniform 'text' schema is valid
        for number, headers, row in extended_rows:
            for i, val in enumerate(row):
                row[i] = str(val)
            yield (number, headers, row)

    with Download(full_agent=self.configuration.get_user_agent()) as downloader:
        try:
            stream = downloader.get_tabular_stream(path, headers=1, post_parse=[convert_to_text],
                                                   bytes_sample_size=1000000)
            nonefieldname = False
            if schema is None:
                # No schema supplied: derive one from the header row, all text.
                # A None header means a trailing unnamed column - flag it so its
                # values can be stripped from each uploaded row below.
                schema = list()
                for fieldname in stream.headers:
                    if fieldname is not None:
                        schema.append({'id': fieldname, 'type': 'text'})
                    else:
                        nonefieldname = True
            data = {'resource_id': self.data['id'], 'force': True, 'fields': schema, 'primary_key': primary_key}
            self._write_to_hdx('datastore_create', data, 'resource_id')
            # upsert requires a primary key; plain insert otherwise
            if primary_key is None:
                method = 'insert'
            else:
                method = 'upsert'
            logger.debug('Uploading data from %s to datastore' % url)
            offset = 0
            chunksize = 100
            # Upload rows in chunks of `chunksize` until the stream is exhausted
            rowset = stream.read(keyed=True, limit=chunksize)
            while len(rowset) != 0:
                if nonefieldname:
                    # Drop values from the unnamed column before uploading
                    for row in rowset:
                        del row[None]
                data = {'resource_id': self.data['id'], 'force': True, 'method': method, 'records': rowset}
                self._write_to_hdx('datastore_upsert', data, 'resource_id')
                rowset = stream.read(keyed=True, limit=chunksize)
                logger.debug('Uploading: %s' % offset)
                offset += chunksize
        except Exception as e:
            raisefrom(HDXError, 'Upload to datastore of %s failed!' % url, e)
        finally:
            if delete_after_download:
                # We downloaded the file ourselves: clean it up
                remove(path)
def create_datastore(self, schema=None, primary_key=None, delete_first=0, path=None):
    # type: (Optional[List[Dict]], Optional[str], int, Optional[str]) -> None
    """For tabular data, create a resource in the HDX datastore which enables data preview in HDX. If no schema is provided
    all fields are assumed to be text. If path is not supplied, the file is first downloaded from HDX.

    Args:
        schema (List[Dict]): List of fields and types of form {'id': 'FIELD', 'type': 'TYPE'}. Defaults to None.
        primary_key (Optional[str]): Primary key of schema. Defaults to None.
        delete_first (int): Delete datastore before creation. 0 = No, 1 = Yes, 2 = If no primary key. Defaults to 0.
        path (Optional[str]): Local path to file that was uploaded. Defaults to None.

    Returns:
        None
    """
    # Optionally clear out an existing datastore before (re)creating it
    if delete_first == 0:
        pass
    elif delete_first == 1:
        self.delete_datastore()
    elif delete_first == 2:
        if primary_key is None:
            self.delete_datastore()
    else:
        raise HDXError(
            'delete_first must be 0, 1 or 2! (0 = No, 1 = Yes, 2 = Delete if no primary key)'
        )
    if path is None:
        # Download the resource
        url, path = self.download()
        delete_after_download = True
    else:
        # Caller supplied a local file; reuse its path in log/error messages
        url = path
        delete_after_download = False

    def convert_to_text(extended_rows):
        # Coerce every cell to str so a uniform 'text' schema is valid
        for number, headers, row in extended_rows:
            for i, val in enumerate(row):
                row[i] = str(val)
            yield (number, headers, row)

    with Download(
            full_agent=self.configuration.get_user_agent()) as downloader:
        try:
            stream = downloader.get_tabular_stream(
                path,
                headers=1,
                post_parse=[convert_to_text],
                bytes_sample_size=1000000)
            nonefieldname = False
            if schema is None:
                # No schema supplied: derive one from the header row, all text.
                # A None header means a trailing unnamed column - flag it so its
                # values can be stripped from each uploaded row below.
                schema = list()
                for fieldname in stream.headers:
                    if fieldname is not None:
                        schema.append({'id': fieldname, 'type': 'text'})
                    else:
                        nonefieldname = True
            data = {
                'resource_id': self.data['id'],
                'force': True,
                'fields': schema,
                'primary_key': primary_key
            }
            self._write_to_hdx('datastore_create', data, 'resource_id')
            # upsert requires a primary key; plain insert otherwise
            if primary_key is None:
                method = 'insert'
            else:
                method = 'upsert'
            logger.debug('Uploading data from %s to datastore' % url)
            offset = 0
            chunksize = 100
            # Upload rows in chunks of `chunksize` until the stream is exhausted
            rowset = stream.read(keyed=True, limit=chunksize)
            while len(rowset) != 0:
                if nonefieldname:
                    # Drop values from the unnamed column before uploading
                    for row in rowset:
                        del row[None]
                data = {
                    'resource_id': self.data['id'],
                    'force': True,
                    'method': method,
                    'records': rowset
                }
                self._write_to_hdx('datastore_upsert', data, 'resource_id')
                rowset = stream.read(keyed=True, limit=chunksize)
                logger.debug('Uploading: %s' % offset)
                offset += chunksize
        except Exception as e:
            raisefrom(HDXError, 'Upload to datastore of %s failed!' % url, e)
        finally:
            if delete_after_download:
                # We downloaded the file ourselves: clean it up
                remove(path)