class CkanLoader(object):
    """
    Directs a CKAN service client to put obtained datasets on CKAN.

    Subclasses implement ``obtain_datasets`` to fill ``self.datasets`` with
    dataset dicts (see ``create_dataset``); ``run`` then registers or updates
    each of them through the CKAN client.
    """

    usage = '''usage: %prog OPTIONS'''

    # Latin-1 code point -> 7-bit ASCII replacement.
    # Table taken from: http://code.activestate.com/recipes/251871/
    # The recipe's mappings for Latin-1 symbols (0xa1-0xbf, 0xd7, 0xf7) are
    # deliberately not included, so those characters are dropped.
    _ASCII_EQUIVALENTS = {
        0xc0: 'A', 0xc1: 'A', 0xc2: 'A', 0xc3: 'A', 0xc4: 'A', 0xc5: 'A',
        0xc6: 'Ae', 0xc7: 'C',
        0xc8: 'E', 0xc9: 'E', 0xca: 'E', 0xcb: 'E',
        0xcc: 'I', 0xcd: 'I', 0xce: 'I', 0xcf: 'I',
        0xd0: 'Th', 0xd1: 'N',
        0xd2: 'O', 0xd3: 'O', 0xd4: 'O', 0xd5: 'O', 0xd6: 'O', 0xd8: 'O',
        0xd9: 'U', 0xda: 'U', 0xdb: 'U', 0xdc: 'U',
        0xdd: 'Y', 0xde: 'th', 0xdf: 'ss',
        0xe0: 'a', 0xe1: 'a', 0xe2: 'a', 0xe3: 'a', 0xe4: 'a', 0xe5: 'a',
        0xe6: 'ae', 0xe7: 'c',
        0xe8: 'e', 0xe9: 'e', 0xea: 'e', 0xeb: 'e',
        0xec: 'i', 0xed: 'i', 0xee: 'i', 0xef: 'i',
        0xf0: 'th', 0xf1: 'n',
        0xf2: 'o', 0xf3: 'o', 0xf4: 'o', 0xf5: 'o', 0xf6: 'o', 0xf8: 'o',
        0xf9: 'u', 0xfa: 'u', 0xfb: 'u', 0xfc: 'u',
        0xfd: 'y', 0xfe: 'th', 0xff: 'y',
    }

    def __init__(self):
        """Sets up options and init the CKAN service client."""
        parser = OptionParser(self.usage)
        self.add_options(parser)
        (self.options, self.args) = parser.parse_args()
        self.init_ckanclient()

    def add_options(self, parser):
        """Adds options for CKAN service location and REST API key.

        Also adds the two flags that switch off the interactive
        confirmation prompts used by ``put_datasets_on_ckan``.
        """
        parser.add_option(
            '--ckan-api-location',
            dest='ckan_api_location',
            default='http://127.0.0.1:5000/api',
            help="""The location of working CKAN REST API.""")
        parser.add_option(
            '--ckan-api-key',
            dest='ckan_api_key',
            help="""A valid CKAN REST API key.""")
        # NOTE: the misspelled dest names ("confimation") are kept on purpose;
        # they are the attribute names read elsewhere in this class.
        parser.add_option(
            '--no-create-confirmation',
            dest='no_create_confimation',
            action='store_true',
            help="""Don't prompt for confirmation when registering a new dataset.""")
        parser.add_option(
            '--no-update-confirmation',
            dest='no_update_confimation',
            action='store_true',
            help="""Don't prompt for confirmation when updating a registered dataset.""")

    def init_ckanclient(self):
        """Init the CKAN client from options.

        Missing location or API key only produces a warning; the client is
        created regardless.
        """
        if not self.options.ckan_api_location:
            print("Warning: CKAN API location not provided.")
        if not self.options.ckan_api_key:
            print("Warning: CKAN API key not provided.")
        self.ckanclient = CkanClient(
            base_location=self.options.ckan_api_location,
            api_key=self.options.ckan_api_key,
        )

    def run(self):
        """Obtain datasets and put them on CKAN.

        A KeyboardInterrupt at any point is caught and reported as a clean
        exit rather than a traceback.
        """
        try:
            self.datasets = []
            self.obtain_datasets()
            print("Putting %s datasets on CKAN running at %s" % (
                len(self.datasets), self.options.ckan_api_location))
            self.put_datasets_on_ckan()
        except KeyboardInterrupt:
            print("")
            print("exiting...")
            print("")

    def obtain_datasets(self):
        """Abstract method for obtaining datasets.

        Raises NotImplementedError (an Exception subclass) unless overridden.
        """
        raise NotImplementedError("Abstract method not implemented.")

    def put_datasets_on_ckan(self):
        """Uses CKAN client to register (or update) obtained datasets.

        For each dataset the client is first asked for an entity with the
        same name. The client's ``last_status`` then decides the action:
        200 -> update, 404 -> register, anything else -> report the failure.
        """
        print("")
        sleep(1)
        for dataset in self.datasets:
            try:
                self.ckanclient.dataset_entity_get(dataset['name'])
            except CkanApiError:
                # Deliberately ignored: the outcome of the lookup is read
                # from the client's last_* attributes just below.
                pass
            client = self.ckanclient
            if client.last_status == 200:
                self._update_dataset(dataset)
            elif client.last_status == 404 or '404' in str(client.last_url_error):
                self._register_dataset(dataset)
            else:
                # A failed lookup never printed the "Not authorised" hint,
                # so the 403 check is skipped here.
                self._report_request_failure(check_authorisation=False)

    def _update_dataset(self, dataset):
        """Show an already registered dataset and, if confirmed, update it."""
        print("Dataset '%s' is already registered" % dataset['name'])
        self._show_dataset(dataset)
        if not self.options.no_update_confimation:
            question = "Do you want to update this dataset with CKAN now? [y/N] "
            if not self._confirm(question, dataset):
                return
        print("Updating dataset...")
        self.ckanclient.dataset_entity_put(dataset)
        if self.ckanclient.last_status == 200:
            print("Updated dataset '%s' OK." % dataset['name'])
            sleep(1)
        else:
            self._report_request_failure()

    def _register_dataset(self, dataset):
        """Show an unregistered dataset and, if confirmed, register it."""
        print("Dataset '%s' not currently registered" % dataset['name'])
        self._show_dataset(dataset)
        if not self.options.no_create_confimation:
            question = "Do you want to register this dataset with CKAN now? [y/N] "
            if not self._confirm(question, dataset):
                return
        print("Registering dataset...")
        self.ckanclient.dataset_register_post(dataset)
        if self.ckanclient.last_status in (200, 201):
            print("Registered dataset '%s' OK." % dataset['name'])
            sleep(1)
        else:
            self._report_request_failure()

    def _show_dataset(self, dataset):
        """Pretty-print a dataset framed by blank lines."""
        print("")
        pprint.pprint(dataset)
        print("")

    def _confirm(self, question, dataset):
        """Ask a y/N question; announce the skip and return False unless 'y'."""
        try:
            read_line = raw_input  # Python 2
        except NameError:
            read_line = input  # Python 3, where raw_input was renamed
        answer = read_line(question)
        if answer and answer.lower()[0] == 'y':
            return True
        print("Skipping '%s' dataset..." % dataset['name'])
        print("")
        sleep(1)
        return False

    def _report_request_failure(self, check_authorisation=True):
        """Print why the last CKAN request failed, then pause.

        Raises Exception when the client recorded neither an HTTP error nor
        a URL error, i.e. there is nothing to report.
        """
        client = self.ckanclient
        if check_authorisation and (
                client.last_status == 403 or '403' in str(client.last_url_error)):
            print("Error: Not authorised. Check your API key.")
            sleep(4)
        elif client.last_http_error:
            print("Error: CKAN returned status code %s: %s" % (
                client.last_status, client.last_http_error))
            sleep(3)
        elif client.last_url_error:
            print("Error: URL problems: %s" % client.last_url_error)
            sleep(3)
        else:
            raise Exception("Error: CKAN request didn't work at all.")

    def create_dataset(self, name, title='', url='', maintainer='',
                       maintainer_email='', author='', author_email='',
                       notes='', tags=None, extras=None, license_id=None,
                       license=None, resources=None):
        """Returns a CKAN REST API dataset from method arguments.

        ``tags``, ``extras`` and ``resources`` default to a fresh empty
        list/dict/list per call, so returned datasets never share state.
        ``name`` is passed through ``coerce_dataset_name``. ``license_id``
        takes precedence over ``license``; if both are None neither key is
        set.

        Raises Exception if ``tags`` is not a list or ``extras`` is not a
        dict.
        """
        if tags is None:
            tags = []
        if extras is None:
            extras = {}
        if resources is None:
            resources = []
        if not isinstance(tags, list):
            raise Exception("Dataset tags must be a list: %s" % (tags,))
        if not isinstance(extras, dict):
            raise Exception("Dataset extras must be a dict: %s" % (extras,))
        dataset = {
            'name': self.coerce_dataset_name(name),
            'title': title,
            'url': url,
            'notes': notes,
            'maintainer': maintainer,
            'maintainer_email': maintainer_email,
            'author': author,
            'author_email': author_email,
            'tags': tags,
            'extras': extras,
        }
        # Pre and post licenses servicization.
        if license_id is not None:
            dataset['license_id'] = license_id
        elif license is not None:
            dataset['license'] = license
        dataset['resources'] = resources
        return dataset

    def coerce_dataset_name(self, name):
        """Converts unicode string to valid CKAN dataset name.

        Currently: ASCII substitution followed by lower-casing.
        """
        # Todo: Probably needs to be finished off.
        name = self.substitute_ascii_equivalents(name)
        return name.lower()

    def substitute_ascii_equivalents(self, unicrap):
        """Replaces Latin-1 accented letters with 7-bit ASCII equivalents.

        Method taken from: http://code.activestate.com/recipes/251871/

        Characters in the 7-bit ASCII range are preserved. Latin-1 accented
        letters are converted to unaccented equivalents (some to two
        letters, e.g. 0xc6 -> 'Ae'). Every other character at or above 0x80,
        including the Latin-1 symbols, is deleted. Returns the resulting
        string.
        """
        table = self._ASCII_EQUIVALENTS
        pieces = []
        for char in unicrap:
            code = ord(char)
            if code in table:
                pieces.append(table[code])
            elif code < 0x80:
                pieces.append(str(char))
            # anything else (>= 0x80 without a mapping) is dropped
        return ''.join(pieces)

    def create_dataset_resource(self, url='', format='', hash='', description=''):
        """Returns a CKAN REST API resource dict from method arguments."""
        return {
            'url': url,
            'format': format,
            'hash': hash,
            'description': description,
        }